From 236ebf7fabbb3dc07b231b9a74c378c40540b1b4 Mon Sep 17 00:00:00 2001 From: Steve Plimpton <sjplimp@sandia.gov> Date: Thu, 8 Sep 2016 13:56:18 -0600 Subject: [PATCH] Kokkos lib update --- lib/kokkos/CMakeLists.txt | 54 +- lib/kokkos/Makefile.kokkos | 23 +- lib/kokkos/Makefile.targets | 14 +- lib/kokkos/README | 31 +- lib/kokkos/algorithms/src/Kokkos_Random.hpp | 349 +- .../algorithms/unit_tests/TestRandom.hpp | 35 +- lib/kokkos/cmake/deps/CUDA.cmake | 79 + lib/kokkos/cmake/deps/CUSPARSE.cmake | 64 + lib/kokkos/cmake/deps/HWLOC.cmake | 70 + lib/kokkos/cmake/deps/Pthread.cmake | 83 + lib/kokkos/cmake/deps/QTHREAD.cmake | 70 + lib/kokkos/cmake/tribits.cmake | 485 +++ .../kokkos-trilinos-integration-procedure.txt | 153 + lib/kokkos/config/master_history.txt | 3 + lib/kokkos/config/nvcc_wrapper | 46 +- lib/kokkos/config/test_all_sandia | 201 +- .../performance_tests/CMakeLists.txt | 17 +- .../containers/performance_tests/TestCuda.cpp | 9 + .../performance_tests/TestDynRankView.hpp | 265 ++ .../performance_tests/TestGlobal2LocalIds.hpp | 2 +- .../performance_tests/TestOpenMP.cpp | 9 + .../performance_tests/TestThreads.cpp | 9 + .../TestUnorderedMapPerformance.hpp | 4 +- .../containers/src/Kokkos_DynRankView.hpp | 1367 +++++-- .../containers/src/Kokkos_DynamicView.hpp | 9 +- .../src/impl/Kokkos_Bitset_impl.hpp | 110 +- .../containers/unit_tests/TestDynViewAPI.hpp | 374 +- .../containers/unit_tests/TestDynamicView.hpp | 7 +- lib/kokkos/core/cmake/KokkosCore_config.h.in | 1 + lib/kokkos/core/perf_test/CMakeLists.txt | 15 +- lib/kokkos/core/perf_test/PerfTestCuda.cpp | 2 +- .../core/perf_test/PerfTestGramSchmidt.hpp | 2 +- lib/kokkos/core/perf_test/PerfTestHexGrad.hpp | 2 +- lib/kokkos/core/perf_test/test_atomic.cpp | 19 +- lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp | 38 +- lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp | 163 +- .../core/src/Cuda/Kokkos_Cuda_Alloc.hpp | 1 - .../src/Cuda/Kokkos_Cuda_BasicAllocators.cpp | 198 - .../src/Cuda/Kokkos_Cuda_BasicAllocators.hpp | 
190 - lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp | 111 +- .../core/src/Cuda/Kokkos_Cuda_Internal.hpp | 40 +- .../core/src/Cuda/Kokkos_Cuda_Parallel.hpp | 774 ++-- .../core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp | 18 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp | 179 + lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp | 519 +++ .../core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp | 12 +- .../core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp | 24 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp | 339 -- .../core/src/KokkosExp_MDRangePolicy.hpp | 611 +++ lib/kokkos/core/src/KokkosExp_View.hpp | 2306 ------------ lib/kokkos/core/src/Kokkos_Complex.hpp | 15 +- ...asicAllocators.hpp => Kokkos_Concepts.hpp} | 99 +- lib/kokkos/core/src/Kokkos_Core.hpp | 72 - lib/kokkos/core/src/Kokkos_Core_fwd.hpp | 7 +- lib/kokkos/core/src/Kokkos_Cuda.hpp | 15 +- lib/kokkos/core/src/Kokkos_CudaSpace.hpp | 93 +- lib/kokkos/core/src/Kokkos_ExecPolicy.hpp | 231 +- lib/kokkos/core/src/Kokkos_HBWSpace.hpp | 15 - lib/kokkos/core/src/Kokkos_HostSpace.hpp | 22 - lib/kokkos/core/src/Kokkos_Macros.hpp | 31 +- lib/kokkos/core/src/Kokkos_MemoryPool.hpp | 1701 +++++++-- lib/kokkos/core/src/Kokkos_OpenMP.hpp | 11 +- lib/kokkos/core/src/Kokkos_Pair.hpp | 25 +- lib/kokkos/core/src/Kokkos_Parallel.hpp | 431 +-- .../core/src/Kokkos_Parallel_Reduce.hpp | 1240 ++++++ lib/kokkos/core/src/Kokkos_ScratchSpace.hpp | 68 +- lib/kokkos/core/src/Kokkos_Serial.hpp | 185 +- lib/kokkos/core/src/Kokkos_TaskPolicy.hpp | 652 +++- lib/kokkos/core/src/Kokkos_Threads.hpp | 10 +- lib/kokkos/core/src/Kokkos_View.hpp | 3322 +++++++++-------- .../src/OpenMP/Kokkos_OpenMP_Parallel.hpp | 120 +- .../core/src/OpenMP/Kokkos_OpenMP_Task.cpp | 329 ++ .../core/src/OpenMP/Kokkos_OpenMP_Task.hpp | 356 ++ .../core/src/OpenMP/Kokkos_OpenMPexec.cpp | 32 +- .../core/src/OpenMP/Kokkos_OpenMPexec.hpp | 138 +- .../core/src/Qthread/Kokkos_QthreadExec.cpp | 2 +- .../core/src/Qthread/Kokkos_QthreadExec.hpp | 32 +- .../src/Qthread/Kokkos_Qthread_Parallel.hpp | 110 
+- .../src/Qthread/Kokkos_Qthread_TaskPolicy.cpp | 37 +- .../src/Qthread/Kokkos_Qthread_TaskPolicy.hpp | 107 +- lib/kokkos/core/src/Qthread/README | 21 +- .../core/src/Threads/Kokkos_ThreadsExec.cpp | 43 +- .../core/src/Threads/Kokkos_ThreadsExec.hpp | 14 - .../core/src/Threads/Kokkos_ThreadsTeam.hpp | 58 +- .../src/Threads/Kokkos_Threads_Parallel.hpp | 100 +- .../src/Threads/Kokkos_Threads_TaskPolicy.cpp | 14 +- .../src/Threads/Kokkos_Threads_TaskPolicy.hpp | 5 +- .../core/src/impl/KokkosExp_SharedAlloc.hpp | 2 +- .../core/src/impl/KokkosExp_ViewCtor.hpp | 4 - .../core/src/impl/KokkosExp_ViewMapping.hpp | 24 +- .../src/impl/Kokkos_AllocationTracker.cpp | 848 ----- .../src/impl/Kokkos_AllocationTracker.hpp | 574 --- .../core/src/impl/Kokkos_AnalyzePolicy.hpp | 197 + .../Kokkos_Atomic_Compare_Exchange_Strong.hpp | 22 +- .../core/src/impl/Kokkos_Atomic_Exchange.hpp | 24 +- .../core/src/impl/Kokkos_Atomic_Fetch_Add.hpp | 41 +- .../core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp | 2 +- .../core/src/impl/Kokkos_Atomic_Generic.hpp | 40 + .../core/src/impl/Kokkos_Atomic_View.hpp | 36 - .../core/src/impl/Kokkos_BasicAllocators.cpp | 287 -- lib/kokkos/core/src/impl/Kokkos_BitOps.hpp | 122 + lib/kokkos/core/src/impl/Kokkos_Core.cpp | 4 +- .../core/src/impl/Kokkos_FunctorAdapter.hpp | 120 +- lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp | 18 - lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp | 16 - .../src/impl/Kokkos_MemoryPool_Inline.hpp | 446 --- .../core/src/impl/Kokkos_PhysicalLayout.hpp | 13 +- .../src/impl/Kokkos_Profiling_DeviceInfo.hpp | 2 +- .../src/impl/Kokkos_Profiling_Interface.cpp | 13 +- .../src/impl/Kokkos_Profiling_Interface.hpp | 2 +- .../core/src/impl/Kokkos_Serial_Task.cpp | 147 + .../core/src/impl/Kokkos_Serial_Task.hpp | 271 ++ .../src/impl/Kokkos_Serial_TaskPolicy.cpp | 18 +- .../src/impl/Kokkos_Serial_TaskPolicy.hpp | 10 +- lib/kokkos/core/src/impl/Kokkos_Tags.hpp | 137 +- lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp | 499 +++ 
.../core/src/impl/Kokkos_TaskQueue_impl.hpp | 569 +++ lib/kokkos/core/src/impl/Kokkos_Timer.hpp | 3 + lib/kokkos/core/src/impl/Kokkos_Traits.hpp | 25 +- .../core/src/impl/Kokkos_ViewSupport.hpp | 129 - .../core/src/impl/Kokkos_ViewTileLeft.hpp | 153 - lib/kokkos/core/unit_test/CMakeLists.txt | 19 +- lib/kokkos/core/unit_test/Makefile | 23 +- lib/kokkos/core/unit_test/TestAggregate.hpp | 661 ---- .../core/unit_test/TestAggregateReduction.hpp | 14 +- .../core/unit_test/TestAllocationTracker.cpp | 159 - lib/kokkos/core/unit_test/TestAtomic.hpp | 11 +- .../core/unit_test/TestAtomicOperations.hpp | 841 +++++ lib/kokkos/core/unit_test/TestCuda.cpp | 361 +- lib/kokkos/core/unit_test/TestCuda_a.cpp | 443 +-- lib/kokkos/core/unit_test/TestCuda_b.cpp | 486 +-- lib/kokkos/core/unit_test/TestCuda_c.cpp | 480 +-- ...Init.cpp => TestDefaultDeviceTypeInit.hpp} | 35 +- .../unit_test/TestDefaultDeviceTypeInit_1.cpp | 2 + .../TestDefaultDeviceTypeInit_10.cpp | 2 + .../TestDefaultDeviceTypeInit_11.cpp | 2 + .../TestDefaultDeviceTypeInit_12.cpp | 2 + .../TestDefaultDeviceTypeInit_13.cpp | 2 + .../TestDefaultDeviceTypeInit_14.cpp | 2 + .../TestDefaultDeviceTypeInit_15.cpp | 2 + .../TestDefaultDeviceTypeInit_16.cpp | 2 + .../unit_test/TestDefaultDeviceTypeInit_2.cpp | 2 + .../unit_test/TestDefaultDeviceTypeInit_3.cpp | 2 + .../unit_test/TestDefaultDeviceTypeInit_4.cpp | 2 + .../unit_test/TestDefaultDeviceTypeInit_5.cpp | 2 + .../unit_test/TestDefaultDeviceTypeInit_6.cpp | 2 + .../unit_test/TestDefaultDeviceTypeInit_7.cpp | 2 + .../unit_test/TestDefaultDeviceTypeInit_8.cpp | 2 + .../unit_test/TestDefaultDeviceTypeInit_9.cpp | 2 + .../TestDefaultDeviceType_a.cpp} | 42 +- lib/kokkos/core/unit_test/TestMDRange.hpp | 555 +++ lib/kokkos/core/unit_test/TestMemoryPool.hpp | 558 ++- lib/kokkos/core/unit_test/TestOpenMP.cpp | 78 + lib/kokkos/core/unit_test/TestOpenMP_a.cpp | 23 +- lib/kokkos/core/unit_test/TestOpenMP_b.cpp | 54 +- lib/kokkos/core/unit_test/TestOpenMP_c.cpp | 62 +- 
.../core/unit_test/TestPolicyConstruction.hpp | 26 +- lib/kokkos/core/unit_test/TestQthread.cpp | 12 +- lib/kokkos/core/unit_test/TestRange.hpp | 5 +- lib/kokkos/core/unit_test/TestReduce.hpp | 1408 +++++++ lib/kokkos/core/unit_test/TestSerial.cpp | 131 +- lib/kokkos/core/unit_test/TestTaskPolicy.hpp | 516 ++- lib/kokkos/core/unit_test/TestTeam.hpp | 291 +- lib/kokkos/core/unit_test/TestThreads.cpp | 120 +- lib/kokkos/core/unit_test/TestViewAPI.hpp | 4 +- lib/kokkos/example/fenl/CGSolve.hpp | 4 +- lib/kokkos/example/fenl/fenl_functors.hpp | 2 +- lib/kokkos/example/fenl/fenl_impl.hpp | 2 +- lib/kokkos/example/global_2_local_ids/G2L.hpp | 2 +- .../example_chol_performance_device.hpp | 2 +- lib/kokkos/example/md_skeleton/main.cpp | 4 +- lib/kokkos/example/multi_fem/Explicit.hpp | 2 +- lib/kokkos/example/multi_fem/Implicit.hpp | 2 +- lib/kokkos/example/multi_fem/Nonlinear.hpp | 2 +- .../example/multi_fem/SparseLinearSystem.hpp | 2 +- lib/kokkos/example/sort_array/CMakeLists.txt | 1 - lib/kokkos/example/sort_array/sort_array.hpp | 2 +- .../example/tutorial/01_hello_world/Makefile | 2 +- .../tutorial/01_hello_world_lambda/Makefile | 2 +- .../tutorial/02_simple_reduce/Makefile | 2 +- .../tutorial/02_simple_reduce_lambda/Makefile | 2 +- .../example/tutorial/03_simple_view/Makefile | 2 +- .../tutorial/03_simple_view_lambda/Makefile | 2 +- .../tutorial/04_simple_memoryspaces/Makefile | 2 +- .../tutorial/05_simple_atomics/Makefile | 2 +- .../Advanced_Views/01_data_layouts/Makefile | 2 +- .../01_data_layouts/data_layouts.cpp | 4 +- .../Advanced_Views/02_memory_traits/Makefile | 2 +- .../02_memory_traits/memory_traits.cpp | 4 +- .../Advanced_Views/03_subviews/Makefile | 2 +- .../Advanced_Views/04_dualviews/Makefile | 2 +- .../Advanced_Views/04_dualviews/dual_view.cpp | 10 +- .../Advanced_Views/05_NVIDIA_UVM/Makefile | 2 +- .../05_NVIDIA_UVM/uvm_example.cpp | 2 +- .../Advanced_Views/06_AtomicViews/Makefile | 2 +- .../07_Overlapping_DeepCopy/Makefile | 2 +- 
.../overlapping_deepcopy.cpp | 2 +- .../Algorithms/01_random_numbers/Makefile | 2 +- .../01_random_numbers/random_numbers.cpp | 2 +- .../01_thread_teams/Makefile | 2 +- .../01_thread_teams_lambda/Makefile | 3 +- .../thread_teams_lambda.cpp | 3 +- .../02_nested_parallel_for/Makefile | 2 +- .../03_vectorization/Makefile | 2 +- .../03_vectorization/vectorization.cpp | 6 +- .../04_team_scan/Makefile | 2 +- .../04_team_scan/team_scan.cpp | 2 +- lib/kokkos/generate_makefile.bash | 2 +- src/MANYBODY/pair_vashishta.cpp | 2 +- src/MANYBODY/pair_vashishta.h | 14 +- src/fix_nve_sphere.cpp | 31 +- src/group.cpp | 113 +- 212 files changed, 18620 insertions(+), 13184 deletions(-) create mode 100644 lib/kokkos/cmake/deps/CUDA.cmake create mode 100644 lib/kokkos/cmake/deps/CUSPARSE.cmake create mode 100644 lib/kokkos/cmake/deps/HWLOC.cmake create mode 100644 lib/kokkos/cmake/deps/Pthread.cmake create mode 100644 lib/kokkos/cmake/deps/QTHREAD.cmake create mode 100644 lib/kokkos/cmake/tribits.cmake create mode 100644 lib/kokkos/config/kokkos-trilinos-integration-procedure.txt create mode 100644 lib/kokkos/config/master_history.txt create mode 100644 lib/kokkos/containers/performance_tests/TestDynRankView.hpp delete mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp delete mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp create mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp create mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp create mode 100644 lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp delete mode 100644 lib/kokkos/core/src/KokkosExp_View.hpp rename lib/kokkos/core/src/{impl/Kokkos_BasicAllocators.hpp => Kokkos_Concepts.hpp} (56%) create mode 100644 lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp create mode 100644 lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp create mode 100644 lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp delete mode 
100644 lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp create mode 100644 lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp create mode 100644 lib/kokkos/core/src/impl/Kokkos_BitOps.hpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_MemoryPool_Inline.hpp create mode 100644 lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp create mode 100644 lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp create mode 100644 lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp create mode 100644 lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp delete mode 100644 lib/kokkos/core/unit_test/TestAllocationTracker.cpp create mode 100644 lib/kokkos/core/unit_test/TestAtomicOperations.hpp rename lib/kokkos/core/unit_test/{TestDefaultDeviceTypeInit.cpp => TestDefaultDeviceTypeInit.hpp} (93%) create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp create mode 100644 
lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp rename lib/kokkos/core/{src/impl/Kokkos_MemoryPool.cpp => unit_test/TestDefaultDeviceType_a.cpp} (77%) create mode 100644 lib/kokkos/core/unit_test/TestMDRange.hpp diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index f45fc8d9fc..1219352f73 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -1,4 +1,15 @@ +IF(COMMAND TRIBITS_PACKAGE_DECL) + SET(KOKKOS_HAS_TRILINOS ON CACHE BOOL "") +ELSE() + SET(KOKKOS_HAS_TRILINOS OFF CACHE BOOL "") +ENDIF() + +IF(NOT KOKKOS_HAS_TRILINOS) + CMAKE_MINIMUM_REQUIRED(VERSION 2.8.11 FATAL_ERROR) + INCLUDE(cmake/tribits.cmake) +ENDIF() + # # A) Forward delcare the package so that certain options are also defined for # subpackages @@ -12,7 +23,22 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS) # subpackages as well. # -TRIBITS_ADD_DEBUG_OPTION() + + +# mfh 01 Aug 2016: See Issue #61: +# +# https://github.com/kokkos/kokkos/issues/61 +# +# Don't use TRIBITS_ADD_DEBUG_OPTION() here, because that defines +# HAVE_KOKKOS_DEBUG. We define KOKKOS_HAVE_DEBUG here instead, +# for compatibility with Kokkos' Makefile build system. + +TRIBITS_ADD_OPTION_AND_DEFINE( + ${PACKAGE_NAME}_ENABLE_DEBUG + ${PACKAGE_NAME_UC}_HAVE_DEBUG + "Enable run-time debug checks. These checks may be expensive, so they are disabled by default in a release build." + ${${PROJECT_NAME}_ENABLE_DEBUG} +) TRIBITS_ADD_OPTION_AND_DEFINE( Kokkos_ENABLE_SIERRA_BUILD @@ -82,11 +108,33 @@ TRIBITS_ADD_OPTION_AND_DEFINE( "${TPL_ENABLE_MPI}" ) +# Set default value of Kokkos_ENABLE_Debug_Bounds_Check option +# +# CMake is case sensitive. The Kokkos_ENABLE_Debug_Bounds_Check +# option (defined below) is annoyingly not all caps, but we need to +# keep it that way for backwards compatibility. 
If users forget and +# try using an all-caps variable, then make it count by using the +# all-caps version as the default value of the original, not-all-caps +# option. Otherwise, the default value of this option comes from +# Kokkos_ENABLE_DEBUG (see Issue #367). + +ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_DEBUG) +IF(DEFINED Kokkos_ENABLE_DEBUG_BOUNDS_CHECK) + IF(Kokkos_ENABLE_DEBUG_BOUNDS_CHECK) + SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT ON) + ELSE() + SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}") + ENDIF() +ELSE() + SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}") +ENDIF() +ASSERT_DEFINED(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT) + TRIBITS_ADD_OPTION_AND_DEFINE( Kokkos_ENABLE_Debug_Bounds_Check KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - "Enable bounds checking support in Kokkos." - OFF + "Enable Kokkos::View run-time bounds checking." + "${Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT}" ) TRIBITS_ADD_OPTION_AND_DEFINE( diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index c01ceaf64d..c9b6cc464d 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -7,7 +7,7 @@ CXXFLAGS=$(CCFLAGS) #Options: OpenMP,Serial,Pthreads,Cuda KOKKOS_DEVICES ?= "OpenMP" #KOKKOS_DEVICES ?= "Pthreads" -#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8,KNL +#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW KOKKOS_ARCH ?= "" #Options: yes,no KOKKOS_DEBUG ?= "no" @@ -97,6 +97,7 @@ KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l)) KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l)) KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l)) 
+KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l)) KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l)) #NVIDIA based @@ -108,10 +109,12 @@ KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l)) KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l)) KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l)) KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ + + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc)) @@ -123,6 +126,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ + + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc)) @@ -142,11 +146,11 @@ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AM #Any AVX? 
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc )) -KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc )) +KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc )) # Decide what ISA level we are able to support -KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc )) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc )) KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc )) KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc )) @@ -304,8 +308,8 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -mcpu=power8 - KOKKOS_LDFLAGS += -mcpu=power8 + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1) @@ -321,8 +325,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1) else # Assume that this is a really a GNU compiler - KOKKOS_CXXFLAGS += -march=core-avx2 - KOKKOS_LDFLAGS += -march=core-avx2 + KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2 + KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2 endif endif endif @@ -390,6 +394,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp ) KOKKOS_CXXFLAGS += -arch=sm_53 endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) + tmp := $(shell echo "\#define 
KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp ) + KOKKOS_CXXFLAGS += -arch=sm_61 +endif endif KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 876ae033b7..86929ea0fe 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -1,9 +1,5 @@ Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp -Kokkos_AllocationTracker.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp -Kokkos_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp @@ -20,6 +16,10 @@ Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Seria $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp +Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp +Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp @@ -32,12 +32,12 @@ Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_M $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) -Kokkos_Cuda_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp +Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp endif @@ -61,6 +61,8 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) 
$(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp +Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp endif Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp diff --git a/lib/kokkos/README b/lib/kokkos/README index 25b3778d95..b094578af6 100644 --- a/lib/kokkos/README +++ b/lib/kokkos/README @@ -37,7 +37,7 @@ hcedwar(at)sandia.gov and crtrott(at)sandia.gov ====Requirements============================================================ ============================================================================ -Primary tested compilers are: +Primary tested compilers on X86 are: GCC 4.7.2 GCC 4.8.4 GCC 4.9.2 @@ -48,26 +48,43 @@ Primary tested compilers are: Clang 3.5.2 Clang 3.6.1 +Primary tested compilers on Power 8 are: + IBM XL 13.1.3 (OpenMP,Serial) + GCC 4.9.2 (OpenMP,Serial) + GCC 5.3.0 (OpenMP,Serial) + Secondary tested compilers are: CUDA 6.5 (with gcc 4.7.2) CUDA 7.0 (with gcc 4.7.2) CUDA 7.5 (with gcc 4.8.4) Other compilers working: - PGI 15.4 - IBM XL 13.1.2 - Cygwin 2.1.0 64bit with gcc 4.9.3 + X86: + Intel 17.0.042 (the FENL example causes internal compiler error) + PGI 15.4 + Cygwin 2.1.0 64bit with gcc 4.9.3 + KNL: + Intel 16.2.181 (the FENL example causes internal compiler error) + Intel 17.0.042 (the FENL example causes internal compiler error) + +Known non-working combinations: + Power8: + GCC 6.1.0 + Pthreads backend + Primary tested compiler are passing in release mode -with warnings as errors. We are using the following set -of flags: +with warnings as errors. They also are tested with a comprehensive set of +backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...). 
+We are using the following set of flags: GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized Secondary compilers are passing without -Werror. -Other compilers are tested occasionally. +Other compilers are tested occasionally, in particular when pushing from develop to +master branch, without -Werror and only for a select set of backends. ============================================================================ ====Getting started========================================================= diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 192b1d64f8..d7c06dc14b 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -771,6 +771,7 @@ namespace Kokkos { friend class Random_XorShift1024_Pool<DeviceType>; public: + typedef Random_XorShift1024_Pool<DeviceType> pool_type; typedef DeviceType device_type; enum {MAX_URAND = 0xffffffffU}; @@ -779,10 +780,10 @@ namespace Kokkos { enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)}; KOKKOS_INLINE_FUNCTION - Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0): + Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0): p_(p),state_idx_(state_idx){ for(int i=0 ; i<16; i++) - state_[i] = state[i]; + state_[i] = state(state_idx,i); } KOKKOS_INLINE_FUNCTION @@ -933,6 +934,7 @@ namespace Kokkos { state_data_type state_; int_view_type p_; int num_states_; + friend class Random_XorShift1024<DeviceType>; public: typedef Random_XorShift1024<DeviceType> generator_type; @@ -1001,7 +1003,7 @@ namespace Kokkos { KOKKOS_INLINE_FUNCTION Random_XorShift1024<DeviceType> get_state() const { const int i = 
DeviceType::hardware_thread_id(); - return Random_XorShift1024<DeviceType>(&state_(i,0),p_(i),i); + return Random_XorShift1024<DeviceType>(state_,p_(i),i); }; KOKKOS_INLINE_FUNCTION @@ -1020,10 +1022,12 @@ namespace Kokkos { int p_; const int state_idx_; uint64_t* state_; + const int stride_; friend class Random_XorShift1024_Pool<Kokkos::Cuda>; public: typedef Kokkos::Cuda device_type; + typedef Random_XorShift1024_Pool<device_type> pool_type; enum {MAX_URAND = 0xffffffffU}; enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; @@ -1031,30 +1035,30 @@ namespace Kokkos { enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)}; KOKKOS_INLINE_FUNCTION - Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0): - p_(p),state_idx_(state_idx),state_(state){ + Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0): + p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){ } KOKKOS_INLINE_FUNCTION uint32_t urand() { - uint64_t state_0 = state_[ p_ ]; - uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ]; + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; state_1 ^= state_1 << 31; state_1 ^= state_1 >> 11; state_0 ^= state_0 >> 30; - uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL; + uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL; tmp = tmp>>16; return static_cast<uint32_t>(tmp&MAX_URAND); } KOKKOS_INLINE_FUNCTION uint64_t urand64() { - uint64_t state_0 = state_[ p_ ]; - uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ]; + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; state_1 ^= state_1 << 31; state_1 ^= state_1 >> 11; state_0 ^= state_0 >> 30; - return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1; + return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1; } 
KOKKOS_INLINE_FUNCTION @@ -1227,9 +1231,9 @@ Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_st if(i>=num_states_) {i = i_offset;} } - return Random_XorShift1024<Kokkos::Cuda>(&state_(i,0), p_(i), i); + return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i); #else - return Random_XorShift1024<Kokkos::Cuda>(&state_(0,0), p_(0), 0); + return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0); #endif } @@ -1248,14 +1252,15 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102 #endif +namespace Impl { -template<class ViewType, class RandomPool, int loops, int rank> +template<class ViewType, class RandomPool, int loops, int rank, class IndexType> struct fill_random_functor_range; -template<class ViewType, class RandomPool, int loops, int rank> +template<class ViewType, class RandomPool, int loops, int rank, class IndexType> struct fill_random_functor_begin_end; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_range<ViewType,RandomPool,loops,1>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType,RandomPool,loops,1,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1268,19 +1273,19 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,1>{ a(a_),rand_pool(rand_pool_),range(range_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (const IndexType& i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) a(idx) = Rand::draw(gen,range); } rand_pool.free_state(gen); } }; -template<class ViewType, class RandomPool, int loops> -struct 
fill_random_functor_range<ViewType,RandomPool,loops,2>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType,RandomPool,loops,2,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1293,12 +1298,12 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{ a(a_),rand_pool(rand_pool_),range(range_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) a(idx,k) = Rand::draw(gen,range); } } @@ -1307,8 +1312,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{ }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_range<ViewType,RandomPool,loops,3>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType,RandomPool,loops,3,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1321,13 +1326,13 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{ a(a_),rand_pool(rand_pool_),range(range_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + 
if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++) a(idx,k,l) = Rand::draw(gen,range); } } @@ -1335,8 +1340,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{ } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_range<ViewType,RandomPool,loops,4>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType,RandomPool,loops,4, IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1349,14 +1354,14 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{ a(a_),rand_pool(rand_pool_),range(range_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) - for(unsigned int m=0;m<a.dimension_3();m++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++) + for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++) a(idx,k,l,m) = Rand::draw(gen,range); } } @@ -1364,8 +1369,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{ } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_range<ViewType,RandomPool,loops,5>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType,RandomPool,loops,5,IndexType>{ typedef typename ViewType::execution_space execution_space; 
ViewType a; RandomPool rand_pool; @@ -1378,15 +1383,15 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{ a(a_),rand_pool(rand_pool_),range(range_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) - for(unsigned int m=0;m<a.dimension_3();m++) - for(unsigned int n=0;n<a.dimension_4();n++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++) + for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++) + for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++) a(idx,k,l,m,n) = Rand::draw(gen,range); } } @@ -1394,8 +1399,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{ } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_range<ViewType,RandomPool,loops,6>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType,RandomPool,loops,6,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1408,16 +1413,16 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{ a(a_),rand_pool(rand_pool_),range(range_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) 
- for(unsigned int m=0;m<a.dimension_3();m++) - for(unsigned int n=0;n<a.dimension_4();n++) - for(unsigned int o=0;o<a.dimension_5();o++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++) + for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++) + for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++) + for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++) a(idx,k,l,m,n,o) = Rand::draw(gen,range); } } @@ -1425,8 +1430,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{ } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_range<ViewType,RandomPool,loops,7>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType,RandomPool,loops,7,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1439,17 +1444,17 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{ a(a_),rand_pool(rand_pool_),range(range_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) - for(unsigned int m=0;m<a.dimension_3();m++) - for(unsigned int n=0;n<a.dimension_4();n++) - for(unsigned int o=0;o<a.dimension_5();o++) - for(unsigned int p=0;p<a.dimension_6();p++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType 
l=0;l<static_cast<IndexType>(a.dimension_2());l++) + for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++) + for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++) + for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++) + for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++) a(idx,k,l,m,n,o,p) = Rand::draw(gen,range); } } @@ -1457,8 +1462,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{ } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_range<ViewType,RandomPool,loops,8>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType,RandomPool,loops,8,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1471,26 +1476,26 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,8>{ a(a_),rand_pool(rand_pool_),range(range_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) - for(unsigned int m=0;m<a.dimension_3();m++) - for(unsigned int n=0;n<a.dimension_4();n++) - for(unsigned int o=0;o<a.dimension_5();o++) - for(unsigned int p=0;p<a.dimension_6();p++) - for(unsigned int q=0;q<a.dimension_7();q++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++) + for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++) + for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++) + for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++) + 
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++) + for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++) a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,range); } } rand_pool.free_state(gen); } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1503,19 +1508,19 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{ a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) a(idx) = Rand::draw(gen,begin,end); } rand_pool.free_state(gen); } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1528,12 +1533,12 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{ a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int 
k=0;k<a.dimension_1();k++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) a(idx,k) = Rand::draw(gen,begin,end); } } @@ -1542,8 +1547,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{ }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1556,13 +1561,13 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{ a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++) a(idx,k,l) = Rand::draw(gen,begin,end); } } @@ -1570,8 +1575,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{ } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1584,14 
+1589,14 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{ a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) - for(unsigned int m=0;m<a.dimension_3();m++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++) + for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++) a(idx,k,l,m) = Rand::draw(gen,begin,end); } } @@ -1599,8 +1604,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{ } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1613,15 +1618,15 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{ a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()){ - for(unsigned int l=0;l<a.dimension_1();l++) - for(unsigned int m=0;m<a.dimension_2();m++) - for(unsigned int n=0;n<a.dimension_3();n++) - for(unsigned int o=0;o<a.dimension_4();o++) + for(IndexType 
j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())){ + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_1());l++) + for(IndexType m=0;m<static_cast<IndexType>(a.dimension_2());m++) + for(IndexType n=0;n<static_cast<IndexType>(a.dimension_3());n++) + for(IndexType o=0;o<static_cast<IndexType>(a.dimension_4());o++) a(idx,l,m,n,o) = Rand::draw(gen,begin,end); } } @@ -1629,8 +1634,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{ } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1643,16 +1648,16 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{ a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) - for(unsigned int m=0;m<a.dimension_3();m++) - for(unsigned int n=0;n<a.dimension_4();n++) - for(unsigned int o=0;o<a.dimension_5();o++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++) + for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++) + for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++) + for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++) 
a(idx,k,l,m,n,o) = Rand::draw(gen,begin,end); } } @@ -1661,8 +1666,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{ }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{ +template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1675,17 +1680,17 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{ a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) - for(unsigned int m=0;m<a.dimension_3();m++) - for(unsigned int n=0;n<a.dimension_4();n++) - for(unsigned int o=0;o<a.dimension_5();o++) - for(unsigned int p=0;p<a.dimension_6();p++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++) + for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++) + for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++) + for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++) + for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++) a(idx,k,l,m,n,o,p) = Rand::draw(gen,begin,end); } } @@ -1693,8 +1698,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{ } }; -template<class ViewType, class RandomPool, int loops> -struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{ 
+template<class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8,IndexType>{ typedef typename ViewType::execution_space execution_space; ViewType a; RandomPool rand_pool; @@ -1707,18 +1712,18 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{ a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} KOKKOS_INLINE_FUNCTION - void operator() (unsigned int i) const { + void operator() (IndexType i) const { typename RandomPool::generator_type gen = rand_pool.get_state(); - for(unsigned int j=0;j<loops;j++) { - const uint64_t idx = i*loops+j; - if(idx<a.dimension_0()) { - for(unsigned int k=0;k<a.dimension_1();k++) - for(unsigned int l=0;l<a.dimension_2();l++) - for(unsigned int m=0;m<a.dimension_3();m++) - for(unsigned int n=0;n<a.dimension_4();n++) - for(unsigned int o=0;o<a.dimension_5();o++) - for(unsigned int p=0;p<a.dimension_6();p++) - for(unsigned int q=0;q<a.dimension_7();q++) + for(IndexType j=0;j<loops;j++) { + const IndexType idx = i*loops+j; + if(idx<static_cast<IndexType>(a.dimension_0())) { + for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++) + for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++) + for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++) + for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++) + for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++) + for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++) + for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++) a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,begin,end); } } @@ -1726,18 +1731,20 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{ } }; -template<class ViewType, class RandomPool> +} + +template<class ViewType, class RandomPool, class IndexType = int64_t> void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) { int64_t LDA = a.dimension_0(); if(LDA>0) - 
parallel_for((LDA+127)/128,fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank>(a,g,range)); + parallel_for((LDA+127)/128,Impl::fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,range)); } -template<class ViewType, class RandomPool> +template<class ViewType, class RandomPool, class IndexType = int64_t> void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) { int64_t LDA = a.dimension_0(); if(LDA>0) - parallel_for((LDA+127)/128,fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank>(a,g,begin,end)); + parallel_for((LDA+127)/128,Impl::fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,begin,end)); } } diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index eade74ed93..c906b9f2cd 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -50,6 +50,7 @@ #include <Kokkos_Core.hpp> #include <Kokkos_Random.hpp> #include <cmath> +#include <chrono> namespace Test { @@ -207,7 +208,6 @@ struct test_histogram1d_functor { density_1d (d1d), mean (1.0*num_draws/HIST_DIM1D*3) { - printf ("Mean: %e\n", mean); } KOKKOS_INLINE_FUNCTION void @@ -295,7 +295,7 @@ struct test_random_scalar { parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result); //printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2); - double tolerance = 2.0*sqrt(1.0/num_draws); + double tolerance = 1.6*sqrt(1.0/num_draws); double mean_expect = 0.5*Kokkos::rand<rnd_type,Scalar>::max(); double variance_expect = 1.0/3.0*mean_expect*mean_expect; double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0; @@ -303,10 +303,10 @@ struct test_random_scalar { double covariance_eps = result.covariance/num_draws/2/variance_expect; pass_mean = ((-tolerance < 
mean_eps) && ( tolerance > mean_eps)) ? 1:0; - pass_var = ((-tolerance < variance_eps) && - ( tolerance > variance_eps)) ? 1:0; - pass_covar = ((-1.4*tolerance < covariance_eps) && - ( 1.4*tolerance > covariance_eps)) ? 1:0; + pass_var = ((-1.5*tolerance < variance_eps) && + ( 1.5*tolerance > variance_eps)) ? 1:0; + pass_covar = ((-2.0*tolerance < covariance_eps) && + ( 2.0*tolerance > covariance_eps)) ? 1:0; cerr << "Pass: " << pass_mean << " " << pass_var << " " << mean_eps @@ -328,12 +328,12 @@ struct test_random_scalar { double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0; double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0; double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect; - pass_hist1d_mean = ((-tolerance < mean_eps) && - ( tolerance > mean_eps)) ? 1:0; - pass_hist1d_var = ((-tolerance < variance_eps) && - ( tolerance > variance_eps)) ? 1:0; - pass_hist1d_covar = ((-tolerance < covariance_eps) && - ( tolerance > covariance_eps)) ? 1:0; + pass_hist1d_mean = ((-0.0001 < mean_eps) && + ( 0.0001 > mean_eps)) ? 1:0; + pass_hist1d_var = ((-0.07 < variance_eps) && + ( 0.07 > variance_eps)) ? 1:0; + pass_hist1d_covar = ((-0.06 < covariance_eps) && + ( 0.06 > covariance_eps)) ? 1:0; cerr << "Density 1D: " << mean_eps << " " << variance_eps @@ -363,8 +363,8 @@ struct test_random_scalar { double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect; pass_hist3d_mean = ((-tolerance < mean_eps) && ( tolerance > mean_eps)) ? 1:0; - pass_hist3d_var = ((-tolerance < variance_eps) && - ( tolerance > variance_eps)) ? 1:0; + pass_hist3d_var = ((-1.2*tolerance < variance_eps) && + ( 1.2*tolerance > variance_eps)) ? 1:0; pass_hist3d_covar = ((-tolerance < covariance_eps) && ( tolerance > covariance_eps)) ? 
1:0; @@ -386,8 +386,13 @@ void test_random(unsigned int num_draws) typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d"); typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d"); + + uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + cerr << "Test Seed:" << ticks << endl; + + RandomGenerator pool(ticks); + cerr << "Test Scalar=int" << endl; - RandomGenerator pool(31891); test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_int.pass_mean,1); ASSERT_EQ( test_int.pass_var,1); diff --git a/lib/kokkos/cmake/deps/CUDA.cmake b/lib/kokkos/cmake/deps/CUDA.cmake new file mode 100644 index 0000000000..801c20067b --- /dev/null +++ b/lib/kokkos/cmake/deps/CUDA.cmake @@ -0,0 +1,79 @@ +# @HEADER +# ************************************************************************ +# +# Trilinos: An Object-Oriented Solver Framework +# Copyright (2001) Sandia Corporation +# +# +# Copyright (2001) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000, there is a non-exclusive license for use of this +# work by or on behalf of the U.S. Government. Export of this program +# may require a license from the United States Government. +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# NOTICE: The United States Government is granted for itself and others +# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide +# license in this data to reproduce, prepare derivative works, and +# perform publicly and display publicly. Beginning five (5) years from +# July 25, 2001, the United States Government is granted for itself and +# others acting on its behalf a paid-up, nonexclusive, irrevocable +# worldwide license in this data to reproduce, prepare derivative works, +# distribute copies to the public, perform publicly and display +# publicly, and to permit others to do so. +# +# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT +# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES +# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR +# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY +# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS +# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. 
+# +# ************************************************************************ +# @HEADER + +# Check for CUDA support + +SET(_CUDA_FAILURE OFF) + +# Have CMake find CUDA +IF(NOT _CUDA_FAILURE) + FIND_PACKAGE(CUDA 3.2) + IF (NOT CUDA_FOUND) + SET(_CUDA_FAILURE ON) + ENDIF() +ENDIF() + +IF(NOT _CUDA_FAILURE) + # if we haven't met failure + macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target) + TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY) + endmacro() + GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) + GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) + TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) +ELSE() + SET(TPL_ENABLE_CUDA OFF) +ENDIF() diff --git a/lib/kokkos/cmake/deps/CUSPARSE.cmake b/lib/kokkos/cmake/deps/CUSPARSE.cmake new file mode 100644 index 0000000000..205f5e2a98 --- /dev/null +++ b/lib/kokkos/cmake/deps/CUSPARSE.cmake @@ -0,0 +1,64 @@ +# @HEADER +# ************************************************************************ +# +# Trilinos: An Object-Oriented Solver Framework +# Copyright (2001) Sandia Corporation +# +# +# Copyright (2001) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000, there is a non-exclusive license for use of this +# work by or on behalf of the U.S. Government. Export of this program +# may require a license from the United States Government. +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# NOTICE: The United States Government is granted for itself and others +# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide +# license in this data to reproduce, prepare derivative works, and +# perform publicly and display publicly. Beginning five (5) years from +# July 25, 2001, the United States Government is granted for itself and +# others acting on its behalf a paid-up, nonexclusive, irrevocable +# worldwide license in this data to reproduce, prepare derivative works, +# distribute copies to the public, perform publicly and display +# publicly, and to permit others to do so. +# +# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT +# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES +# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR +# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY +# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS +# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. 
+# +# ************************************************************************ +# @HEADER + +include(${TRIBITS_DEPS_DIR}/CUDA.cmake) + +IF (TPL_ENABLE_CUDA) + GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) + GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) + GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) + TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) +ENDIF() + diff --git a/lib/kokkos/cmake/deps/HWLOC.cmake b/lib/kokkos/cmake/deps/HWLOC.cmake new file mode 100644 index 0000000000..275abd3a5d --- /dev/null +++ b/lib/kokkos/cmake/deps/HWLOC.cmake @@ -0,0 +1,70 @@ +# @HEADER +# ************************************************************************ +# +# Trilinos: An Object-Oriented Solver Framework +# Copyright (2001) Sandia Corporation +# +# +# Copyright (2001) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000, there is a non-exclusive license for use of this +# work by or on behalf of the U.S. Government. Export of this program +# may require a license from the United States Government. +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# NOTICE: The United States Government is granted for itself and others +# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide +# license in this data to reproduce, prepare derivative works, and +# perform publicly and display publicly. Beginning five (5) years from +# July 25, 2001, the United States Government is granted for itself and +# others acting on its behalf a paid-up, nonexclusive, irrevocable +# worldwide license in this data to reproduce, prepare derivative works, +# distribute copies to the public, perform publicly and display +# publicly, and to permit others to do so. +# +# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT +# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES +# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR +# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY +# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS +# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# +# ************************************************************************ +# @HEADER + + +#----------------------------------------------------------------------------- +# Hardware locality detection and control library. +# +# Acquisition information: +# Date checked: November 2011 +# Checked by: H. 
Carter Edwards <hcedwar AT sandia.gov> +# Source: http://www.open-mpi.org/projects/hwloc/ +# Version: 1.3 +# + +TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC + REQUIRED_HEADERS hwloc.h + REQUIRED_LIBS_NAMES "hwloc" + ) diff --git a/lib/kokkos/cmake/deps/Pthread.cmake b/lib/kokkos/cmake/deps/Pthread.cmake new file mode 100644 index 0000000000..46d0a939ca --- /dev/null +++ b/lib/kokkos/cmake/deps/Pthread.cmake @@ -0,0 +1,83 @@ +# @HEADER +# ************************************************************************ +# +# Trilinos: An Object-Oriented Solver Framework +# Copyright (2001) Sandia Corporation +# +# +# Copyright (2001) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000, there is a non-exclusive license for use of this +# work by or on behalf of the U.S. Government. Export of this program +# may require a license from the United States Government. +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# NOTICE: The United States Government is granted for itself and others +# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide +# license in this data to reproduce, prepare derivative works, and +# perform publicly and display publicly. Beginning five (5) years from +# July 25, 2001, the United States Government is granted for itself and +# others acting on its behalf a paid-up, nonexclusive, irrevocable +# worldwide license in this data to reproduce, prepare derivative works, +# distribute copies to the public, perform publicly and display +# publicly, and to permit others to do so. +# +# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT +# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES +# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR +# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY +# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS +# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# +# ************************************************************************ +# @HEADER + + +SET(USE_THREADS FALSE) + +IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) + # Use CMake's Thread finder since it is a bit smarter in determining + # whether pthreads is already built into the compiler and doesn't need + # a library to link. 
+ FIND_PACKAGE(Threads) + #If Threads found a copy of pthreads make sure it is one of the cases the tribits + #tpl system cannot handle. + IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + SET(USE_THREADS TRUE) + ENDIF() + ENDIF() +ENDIF() + +IF(USE_THREADS) + SET(TPL_Pthread_INCLUDE_DIRS "") + SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + SET(TPL_Pthread_LIBRARY_DIRS "") + TIBITS_CREATE_IMPORTED_TPL_LIBRARY(Pthread) +ELSE() + TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread + REQUIRED_HEADERS pthread.h + REQUIRED_LIBS_NAMES pthread + ) +ENDIF() diff --git a/lib/kokkos/cmake/deps/QTHREAD.cmake b/lib/kokkos/cmake/deps/QTHREAD.cmake new file mode 100644 index 0000000000..994b72b200 --- /dev/null +++ b/lib/kokkos/cmake/deps/QTHREAD.cmake @@ -0,0 +1,70 @@ +# @HEADER +# ************************************************************************ +# +# Trilinos: An Object-Oriented Solver Framework +# Copyright (2001) Sandia Corporation +# +# +# Copyright (2001) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000, there is a non-exclusive license for use of this +# work by or on behalf of the U.S. Government. Export of this program +# may require a license from the United States Government. +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# NOTICE: The United States Government is granted for itself and others +# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide +# license in this data to reproduce, prepare derivative works, and +# perform publicly and display publicly. Beginning five (5) years from +# July 25, 2001, the United States Government is granted for itself and +# others acting on its behalf a paid-up, nonexclusive, irrevocable +# worldwide license in this data to reproduce, prepare derivative works, +# distribute copies to the public, perform publicly and display +# publicly, and to permit others to do so. +# +# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT +# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES +# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR +# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY +# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS +# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. 
+# +# ************************************************************************ +# @HEADER + + +#----------------------------------------------------------------------------- +# Hardware locality detection and control library. +# +# Acquisition information: +# Date checked: July 2014 +# Checked by: H. Carter Edwards <hcedwar AT sandia.gov> +# Source: https://code.google.com/p/qthreads +# + +TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD + REQUIRED_HEADERS qthread.h + REQUIRED_LIBS_NAMES "qthread" + ) + diff --git a/lib/kokkos/cmake/tribits.cmake b/lib/kokkos/cmake/tribits.cmake new file mode 100644 index 0000000000..34cd216f81 --- /dev/null +++ b/lib/kokkos/cmake/tribits.cmake @@ -0,0 +1,485 @@ +INCLUDE(CMakeParseArguments) +INCLUDE(CTest) + +FUNCTION(ASSERT_DEFINED VARS) + FOREACH(VAR ${VARS}) + IF(NOT DEFINED ${VAR}) + MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!") + ENDIF() + ENDFOREACH() +ENDFUNCTION() + +MACRO(GLOBAL_SET VARNAME) + SET(${VARNAME} ${ARGN} CACHE INTERNAL "") +ENDMACRO() + +MACRO(PREPEND_GLOBAL_SET VARNAME) + ASSERT_DEFINED(${VARNAME}) + GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) +ENDMACRO() + +FUNCTION(REMOVE_GLOBAL_DUPLICATES VARNAME) + ASSERT_DEFINED(${VARNAME}) + IF (${VARNAME}) + SET(TMP ${${VARNAME}}) + LIST(REMOVE_DUPLICATES TMP) + GLOBAL_SET(${VARNAME} ${TMP}) + ENDIF() +ENDFUNCTION() + +MACRO(TRIBITS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE) + MESSAGE(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' '${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'") + SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" ) + IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "") + IF(${USER_OPTION_NAME}) + GLOBAL_SET(${MACRO_DEFINE_NAME} ON) + ELSE() + GLOBAL_SET(${MACRO_DEFINE_NAME} OFF) + ENDIF() + ENDIF() +ENDMACRO() + +FUNCTION(TRIBITS_CONFIGURE_FILE PACKAGE_NAME_CONFIG_FILE) + + # Configure the file + CONFIGURE_FILE( + ${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in + 
${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE} + ) + +ENDFUNCTION() + +MACRO(TRIBITS_ADD_DEBUG_OPTION) + TRIBITS_ADD_OPTION_AND_DEFINE( + ${PROJECT_NAME}_ENABLE_DEBUG + HAVE_${PROJECT_NAME_UC}_DEBUG + "Enable a host of runtime debug checking." + OFF + ) +ENDMACRO() + + +MACRO(TRIBITS_ADD_TEST_DIRECTORIES) + FOREACH(TEST_DIR ${ARGN}) + ADD_SUBDIRECTORY(${TEST_DIR}) + ENDFOREACH() +ENDMACRO() + +MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES) + + IF(${PACKAGE_NAME}_ENABLE_EXAMPLES OR ${PARENT_PACKAGE_NAME}_ENABLE_EXAMPLES) + FOREACH(EXAMPLE_DIR ${ARGN}) + ADD_SUBDIRECTORY(${EXAMPLE_DIR}) + ENDFOREACH() + ENDIF() + +ENDMACRO() + +MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT) + SET(PROP_VALUES) + FOREACH(TARGET_X ${ARGN}) + LIST(APPEND PROP_VALUES "$<TARGET_PROPERTY:${TARGET_X},${PROP_IN}>") + ENDFOREACH() + SET_TARGET_PROPERTIES(${TARGET_NAME} PROPERTIES ${PROP_OUT} "${PROP_VALUES}") +ENDMACRO() + +MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) + FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") + ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) + SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE) +ENDMACRO() + +# Older versions of cmake does not make include directories transitive +MACRO(TARGET_LINK_AND_INCLUDE_LIBRARIES TARGET_NAME) + TARGET_LINK_LIBRARIES(${TARGET_NAME} LINK_PUBLIC ${ARGN}) + FOREACH(DEP_LIB ${ARGN}) + TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INTERFACE_INCLUDE_DIRECTORIES>) + TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INCLUDE_DIRECTORIES>) + ENDFOREACH() +ENDMACRO() + +FUNCTION(TRIBITS_ADD_LIBRARY LIBRARY_NAME) + + SET(options STATIC SHARED TESTONLY NO_INSTALL_LIB_OR_HEADERS CUDALIBRARY) + SET(oneValueArgs) + SET(multiValueArgs HEADERS HEADERS_INSTALL_SUBDIR NOINSTALLHEADERS SOURCES DEPLIBS IMPORTEDLIBS DEFINES ADDED_LIB_TARGET_NAME_OUT) + + CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + 
IF(PARSE_HEADERS) + LIST(REMOVE_DUPLICATES PARSE_HEADERS) + ENDIF() + IF(PARSE_SOURCES) + LIST(REMOVE_DUPLICATES PARSE_SOURCES) + ENDIF() + + # Local variable to hold all of the libraries that will be directly linked + # to this library. + SET(LINK_LIBS ${${PACKAGE_NAME}_DEPS}) + + # Add dependent libraries passed directly in + + IF (PARSE_IMPORTEDLIBS) + LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS}) + ENDIF() + + IF (PARSE_DEPLIBS) + LIST(APPEND LINK_LIBS ${PARSE_DEPLIBS}) + ENDIF() + + # Add the library and all the dependencies + + IF (PARSE_DEFINES) + ADD_DEFINITIONS(${PARSE_DEFINES}) + ENDIF() + + IF (PARSE_STATIC) + SET(STATIC_KEYWORD "STATIC") + ELSE() + SET(STATIC_KEYWORD) + ENDIF() + + IF (PARSE_SHARED) + SET(SHARED_KEYWORD "SHARED") + ELSE() + SET(SHARED_KEYWORD) + ENDIF() + + IF (PARSE_TESTONLY) + SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL") + ELSE() + SET(EXCLUDE_FROM_ALL_KEYWORD) + ENDIF() + IF (NOT PARSE_CUDALIBRARY) + ADD_LIBRARY( + ${LIBRARY_NAME} + ${STATIC_KEYWORD} + ${SHARED_KEYWORD} + ${EXCLUDE_FROM_ALL_KEYWORD} + ${PARSE_HEADERS} + ${PARSE_NOINSTALLHEADERS} + ${PARSE_SOURCES} + ) + ELSE() + CUDA_ADD_LIBRARY( + ${LIBRARY_NAME} + ${PARSE_HEADERS} + ${PARSE_NOINSTALLHEADERS} + ${PARSE_SOURCES} + ) + ENDIF() + + TARGET_LINK_AND_INCLUDE_LIBRARIES(${LIBRARY_NAME} ${LINK_LIBS}) + + IF (NOT PARSE_TESTONLY OR PARSE_NO_INSTALL_LIB_OR_HEADERS) + + INSTALL( + TARGETS ${LIBRARY_NAME} + EXPORT ${PROJECT_NAME} + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + COMPONENT ${PACKAGE_NAME} + ) + + INSTALL( + FILES ${PARSE_HEADERS} + EXPORT ${PROJECT_NAME} + DESTINATION include + COMPONENT ${PACKAGE_NAME} + ) + + INSTALL( + DIRECTORY ${PARSE_HEADERS_INSTALL_SUBDIR} + EXPORT ${PROJECT_NAME} + DESTINATION include + COMPONENT ${PACKAGE_NAME} + ) + + ENDIF() + + IF (NOT PARSE_TESTONLY) + PREPEND_GLOBAL_SET(${PACKAGE_NAME}_LIBS ${LIBRARY_NAME}) + REMOVE_GLOBAL_DUPLICATES(${PACKAGE_NAME}_LIBS) + ENDIF() + +ENDFUNCTION() + 
+FUNCTION(TRIBITS_ADD_EXECUTABLE EXE_NAME) + + SET(options NOEXEPREFIX NOEXESUFFIX ADD_DIR_TO_NAME INSTALLABLE TESTONLY) + SET(oneValueArgs ADDED_EXE_TARGET_NAME_OUT) + SET(multiValueArgs SOURCES CATEGORIES HOST XHOST HOSTTYPE XHOSTTYPE DIRECTORY TESTONLYLIBS IMPORTEDLIBS DEPLIBS COMM LINKER_LANGUAGE TARGET_DEFINES DEFINES) + + CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + IF (PARSE_TARGET_DEFINES) + TARGET_COMPILE_DEFINITIONS(${EXE_NAME} PUBLIC ${PARSE_TARGET_DEFINES}) + ENDIF() + + SET(LINK_LIBS PACKAGE_${PACKAGE_NAME}) + + IF (PARSE_TESTONLYLIBS) + LIST(APPEND LINK_LIBS ${PARSE_TESTONLYLIBS}) + ENDIF() + + IF (PARSE_IMPORTEDLIBS) + LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS}) + ENDIF() + + SET (EXE_SOURCES) + IF(PARSE_DIRECTORY) + FOREACH( SOURCE_FILE ${PARSE_SOURCES} ) + IF(IS_ABSOLUTE ${SOURCE_FILE}) + SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE}) + ELSE() + SET (EXE_SOURCES ${EXE_SOURCES} ${PARSE_DIRECTORY}/${SOURCE_FILE}) + ENDIF() + ENDFOREACH( ) + ELSE() + FOREACH( SOURCE_FILE ${PARSE_SOURCES} ) + SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE}) + ENDFOREACH( ) + ENDIF() + + SET(EXE_BINARY_NAME ${EXE_NAME}) + IF(DEFINED PACKAGE_NAME AND NOT PARSE_NOEXEPREFIX) + SET(EXE_BINARY_NAME ${PACKAGE_NAME}_${EXE_BINARY_NAME}) + ENDIF() + + IF (PARSE_TESTONLY) + SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL") + ELSE() + SET(EXCLUDE_FROM_ALL_KEYWORD) + ENDIF() + ADD_EXECUTABLE(${EXE_BINARY_NAME} ${EXCLUDE_FROM_ALL_KEYWORD} ${EXE_SOURCES}) + + TARGET_LINK_AND_INCLUDE_LIBRARIES(${EXE_BINARY_NAME} ${LINK_LIBS}) + + IF(PARSE_ADDED_EXE_TARGET_NAME_OUT) + SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${EXE_BINARY_NAME} PARENT_SCOPE) + ENDIF() + + IF(PARSE_INSTALLABLE) + INSTALL( + TARGETS ${EXE_BINARY_NAME} + EXPORT ${PROJECT_NAME} + DESTINATION bin + ) + ENDIF() +ENDFUNCTION() + +ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR}) + +FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME) + + SET(options 
STANDARD_PASS_OUTPUT WILL_FAIL) + SET(oneValueArgs PASS_REGULAR_EXPRESSION FAIL_REGULAR_EXPRESSION ENVIRONMENT TIMEOUT CATEGORIES ADDED_TESTS_NAMES_OUT ADDED_EXE_TARGET_NAME_OUT) + SET(multiValueArgs) + + CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + TRIBITS_ADD_EXECUTABLE(${EXE_NAME} TESTONLY ADDED_EXE_TARGET_NAME_OUT TEST_NAME ${PARSE_UNPARSED_ARGUMENTS}) + + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${TEST_NAME}${CMAKE_EXECUTABLE_SUFFIX}) + ELSE() + ADD_TEST(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) + ENDIF() + ADD_DEPENDENCIES(check ${TEST_NAME}) + + IF(PARSE_FAIL_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${PARSE_FAIL_REGULAR_EXPRESSION}) + ENDIF() + + IF(PARSE_PASS_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${PARSE_PASS_REGULAR_EXPRESSION}) + ENDIF() + + IF(PARSE_WILL_FAIL) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${PARSE_WILL_FAIL}) + ENDIF() + + IF(PARSE_ADDED_TESTS_NAMES_OUT) + SET(${PARSE_ADDED_TESTS_NAMES_OUT} ${TEST_NAME} PARENT_SCOPE) + ENDIF() + + IF(PARSE_ADDED_EXE_TARGET_NAME_OUT) + SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${TEST_NAME} PARENT_SCOPE) + ENDIF() + +ENDFUNCTION() + +MACRO(TIBITS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) + ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME}) + TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) + TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) +ENDMACRO() + +FUNCTION(TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) + + SET(options MUST_FIND_ALL_LIBS MUST_FIND_ALL_HEADERS NO_PRINT_ENABLE_SUCCESS_FAIL) + SET(oneValueArgs) + SET(multiValueArgs REQUIRED_HEADERS REQUIRED_LIBS_NAMES) + + CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE) + IF (PARSE_REQUIRED_LIBS_NAMES) + 
FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) + IF(NOT TPL_${TPL_NAME}_LIBRARIES) + SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + ENDIF() + ENDIF() + IF (PARSE_REQUIRED_HEADERS) + FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) + IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) + SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + ENDIF() + ENDIF() + + + IF (_${TPL_NAME}_ENABLE_SUCCESS) + TIBITS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME}) + ENDIF() + +ENDFUNCTION() + +MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE) + GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE) + INCLUDE("${TPL_FILE}") + IF(TARGET TPL_LIB_${TPL_NAME}) + MESSAGE(STATUS "Found tpl library: ${TPL_NAME}") + SET(TPL_ENABLE_${TPL_NAME} TRUE) + ELSE() + MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}") + SET(TPL_ENABLE_${TPL_NAME} FALSE) + ENDIF() +ENDMACRO() + +MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE) + IF(TYPE STREQUAL "REQUIRED") + SET(REQUIRED TRUE) + ELSE() + SET(REQUIRED FALSE) + ENDIF() + IF(TARGET ${TARGET_NAME}) + PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME}) + ELSE() + IF(REQUIRED) + MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}") + ENDIF() + ENDIF() +ENDMACRO() + +MACRO(TRIBITS_APPEND_PACKAGE_DEPS DEP_LIST TYPE) + FOREACH(DEP ${ARGN}) + PREPEND_GLOBAL_SET(${DEP_LIST} PACKAGE_${DEP}) + ENDFOREACH() +ENDMACRO() + +MACRO(TRIBITS_APPEND_TPLS_DEPS DEP_LIST TYPE) + FOREACH(DEP ${ARGN}) + PREPEND_TARGET_SET(${DEP_LIST} TPL_LIB_${DEP} ${TYPE}) + ENDFOREACH() +ENDMACRO() + +MACRO(TRIBITS_ENABLE_TPLS) + FOREACH(TPL ${ARGN}) + IF(TARGET ${TPL}) + GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} TRUE) + ELSE() + GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} FALSE) + ENDIF() + ENDFOREACH() +ENDMACRO() + +MACRO(TRIBITS_PACKAGE_DEFINE_DEPENDENCIES) + + SET(options) + SET(oneValueArgs) + SET(multiValueArgs + LIB_REQUIRED_PACKAGES + LIB_OPTIONAL_PACKAGES + TEST_REQUIRED_PACKAGES + TEST_OPTIONAL_PACKAGES + LIB_REQUIRED_TPLS + LIB_OPTIONAL_TPLS + TEST_REQUIRED_TPLS + 
TEST_OPTIONAL_TPLS + REGRESSION_EMAIL_LIST + SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS + ) + CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + GLOBAL_SET(${PACKAGE_NAME}_DEPS "") + TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_PACKAGES}) + TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_PACKAGES}) + TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_TPLS}) + TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_TPLS}) + + GLOBAL_SET(${PACKAGE_NAME}_TEST_DEPS "") + TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_PACKAGES}) + TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_PACKAGES}) + TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_TPLS}) + TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_TPLS}) + + TRIBITS_ENABLE_TPLS(${PARSE_LIB_REQUIRED_TPLS} ${PARSE_LIB_OPTIONAL_TPLS} ${PARSE_TEST_REQUIRED_TPLS} ${PARSE_TEST_OPTIONAL_TPLS}) + +ENDMACRO() + +MACRO(TRIBITS_SUBPACKAGE NAME) + SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME}) + SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME}) + STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC) + + ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME}) + + GLOBAL_SET(${PACKAGE_NAME}_LIBS "") + + INCLUDE(${PACKAGE_SOURCE_DIR}/cmake/Dependencies.cmake) + +ENDMACRO(TRIBITS_SUBPACKAGE) + +MACRO(TRIBITS_SUBPACKAGE_POSTPROCESS) + TARGET_LINK_AND_INCLUDE_LIBRARIES(PACKAGE_${PACKAGE_NAME} ${${PACKAGE_NAME}_LIBS}) +ENDMACRO(TRIBITS_SUBPACKAGE_POSTPROCESS) + +MACRO(TRIBITS_PACKAGE_DECL NAME) + + PROJECT(${NAME}) + STRING(TOUPPER ${PROJECT_NAME} PROJECT_NAME_UC) + SET(PACKAGE_NAME ${PROJECT_NAME}) + STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC) + + SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps") + FILE(GLOB TPLS_FILES 
"${TRIBITS_DEPS_DIR}/*.cmake") + FOREACH(TPL_FILE ${TPLS_FILES}) + TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE}) + ENDFOREACH() + +ENDMACRO() + + +MACRO(TRIBITS_PROCESS_SUBPACKAGES) + FILE(GLOB SUBPACKAGES RELATIVE ${CMAKE_SOURCE_DIR} */cmake/Dependencies.cmake) + FOREACH(SUBPACKAGE ${SUBPACKAGES}) + GET_FILENAME_COMPONENT(SUBPACKAGE_CMAKE ${SUBPACKAGE} DIRECTORY) + GET_FILENAME_COMPONENT(SUBPACKAGE_DIR ${SUBPACKAGE_CMAKE} DIRECTORY) + ADD_SUBDIRECTORY(${SUBPACKAGE_DIR}) + ENDFOREACH() +ENDMACRO(TRIBITS_PROCESS_SUBPACKAGES) + +MACRO(TRIBITS_PACKAGE_DEF) +ENDMACRO(TRIBITS_PACKAGE_DEF) + +MACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES) +ENDMACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES) + +MACRO(TRIBITS_EXCLUDE_FILES) +ENDMACRO(TRIBITS_EXCLUDE_FILES) + +MACRO(TRIBITS_PACKAGE_POSTPROCESS) +ENDMACRO(TRIBITS_PACKAGE_POSTPROCESS) + diff --git a/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt b/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt new file mode 100644 index 0000000000..9f56f2fd48 --- /dev/null +++ b/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt @@ -0,0 +1,153 @@ +// -------------------------------------------------------------------------------- // + +The following steps are for workstations/servers with the SEMS environment installed. + +// -------------------------------------------------------------------------------- // +Summary: + +- Step 1: Rigorous testing of Kokkos' develop branch for each backend (Serial, OpenMP, Threads, Cuda) with all supported compilers. + +- Step 2: Snapshot Kokkos' develop branch into current Trilinos develop branch. + +- Step 3: Build and test Trilinos with combinations of compilers, types, backends. + +- Step 4: Promote Kokkos develop branch to master if the snapshot does not cause any new tests to fail; else track/fix causes of new failures. + +- Step 5: Snapshot Kokkos tagged master branch into Trilinos and push Trilinos. 
+// -------------------------------------------------------------------------------- // + + +// -------------------------------------------------------------------------------- // + +Step 1: + 1.1. Update kokkos develop branch (NOT a fork) + + (From kokkos directory): + git fetch --all + git checkout develop + git reset --hard origin/develop + + 1.2. Create a testing directory - here the directory is created within the kokkos directory + + mkdir testing + cd testing + + 1.3. Run the test_all_sandia script; various compiler and build-list options can be specified + + ../config/test_all_sandia + + 1.4 Clean repository of untracked files + + cd ../ + git clean -df + +// -------------------------------------------------------------------------------- // + +Step 2: + 2.1 Update Trilinos develop branch + + (From Trilinos directory): + git checkout develop + git fetch --all + git reset --hard origin/develop + git clean -df + + 2.2 Snapshot Kokkos into Trilinos - this requires python/2.7.9 and that both Trilinos and Kokkos be clean - no untracked or modified files + + module load python/2.7.9 + python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages + +// -------------------------------------------------------------------------------- // + +Step 3: + 3.1. 
Build and test Trilinos with 3 different configurations; a configure-all script is provided in Trilinos and should be modified to test each of the following 3 configurations with appropriate environment variable(s): + + - GCC/4.7.2-OpenMP/Complex + Run tests with the following environment variable: + + export OMP_NUM_THREADS=2 + + + - Intel/15.0.2-Serial/NoComplex + + + - GCC/4.8.4/CUDA/7.5.18-Cuda/Serial/NoComplex + Run tests with the following environment variables: + + export CUDA_LAUNCH_BLOCKING=1 + export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 + + + mkdir Build + cd Build + cp TRILINOS_PATH/sampleScripts/Sandia-SEMS/configure-all ./ + ** Set the path to Trilinos appropriately within the configure-all script ** + source $SEMS_MODULE_ROOT/utils/sems-modules-init.sh kokkos + source configure-all + make -k (-k means "keep going" to get past build errors; -j12 can also be specified to build with 12 threads, for example) + ctest + + 3.2. Compare the failed test output to the test output on the dashboard ( testing.sandia.gov/cdash select Trilinos ); investigate and fix problems if new tests fail after the Kokkos snapshot + +// -------------------------------------------------------------------------------- // + +Step 4: + 4.1. Once all Trilinos tests pass promote Kokkos develop branch to master on Github + + - DO NOT fast-forward the merge!!!! + + (From kokkos directory): + git checkout master + git fetch --all + # Ensure we are on the current origin/master + git reset --hard origin/master + git merge --no-ff origin/develop + + 4.2. 
Update the tag in kokkos/config/master_history.txt + Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate + Tag format: #.#.## + + # Prepend master_history.txt with + + # tag: #.#.## + # date: mm/dd/yyyy + # master: sha1 + # develop: sha1 + # ----------------------- + + git commit --amend -a + + git tag -a #.#.## + tag: #.#.## + date: mm/dd/yyyy + master: sha1 + develop: sha1 + + git push --follow-tags origin master + +// -------------------------------------------------------------------------------- // + +Step 5: + 5.1. Make sure Trilinos is up-to-date - chances are other changes have been committed since the integration testing process began. If a substantial change has occurred that may be affected by the snapshot the testing procedure may need to be repeated + + (From Trilinos directory): + git checkout develop + git fetch --all + git reset --hard origin/develop + git clean -df + + 5.2. Snapshot Kokkos master branch into Trilinos + + (From kokkos directory): + git fetch --all + git checkout tags/#.#.## + git clean -df + + python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages + + 5.3. Push the updated develop branch of Trilinos to Github - congratulations!!! 
+ + (From Trilinos directory): + git push + +// -------------------------------------------------------------------------------- // diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt new file mode 100644 index 0000000000..f2eb674578 --- /dev/null +++ b/lib/kokkos/config/master_history.txt @@ -0,0 +1,3 @@ +tag: 2.01.00 date: 07:21:2016 master: xxxxxxxx develop: fa6dfcc4 +tag: 2.01.06 date: 09:02:2016 master: 9afaa87f develop: 555f1a3a + diff --git a/lib/kokkos/config/nvcc_wrapper b/lib/kokkos/config/nvcc_wrapper index d583866191..6093cb61bd 100755 --- a/lib/kokkos/config/nvcc_wrapper +++ b/lib/kokkos/config/nvcc_wrapper @@ -1,17 +1,12 @@ #!/bin/bash # # This shell script (nvcc_wrapper) wraps both the host compiler and -# NVCC, if you are building Trilinos with CUDA enabled. The script -# remedies some differences between the interface of NVCC and that of -# the host compiler, in particular for linking. It also means that -# Trilinos doesn't need separate .cu files; it can just use .cpp -# files. +# NVCC, if you are building legacy C or C++ code with CUDA enabled. +# The script remedies some differences between the interface of NVCC +# and that of the host compiler, in particular for linking. +# It also means that a legacy code doesn't need separate .cu files; +# it can just use .cpp files. # -# Hopefully, at some point, NVIDIA may fix NVCC so as to make this -# script obsolete. For now, this script exists and if you want to -# build Trilinos with CUDA enabled, you must use this script as your -# compiler. - # Default settings: change those according to your machine. For # example, you may have have two different wrappers with either icpc # or g++ as their back-end compiler. 
The defaults can be overwritten
@@ -53,6 +48,10 @@ object_files=""
 # Link objects for the host linker only
 object_files_xlinker=""
+# Shared libraries with version numbers are not handled correctly by NVCC
+shared_versioned_libraries_host=""
+shared_versioned_libraries=""
+
 # Does the User set the architecture
 arch_set=0
@@ -76,6 +75,9 @@ first_xcompiler_arg=1
 temp_dir=${TMPDIR:-/tmp}
+# Check if we have an optimization argument already
+optimization_applied=0
+
 #echo "Arguments: $# $@"
 while [ $# -gt 0 ]
@@ -97,8 +99,17 @@ do
    *.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
      cpp_files="$cpp_files $1"
      ;;
+    # Ensure we only have one optimization flag because NVCC doesn't allow multiple
+    -O*)
+      if [ $optimization_applied -eq 1 ]; then
+        echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
+      else
+        shared_args="$shared_args $1"
+        optimization_applied=1
+      fi
+      ;;
    #Handle shared args (valid for both nvcc and the host compiler)
-    -O*|-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
+    -D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
      shared_args="$shared_args $1"
      ;;
    #Handle shared args that have an argument
@@ -107,7 +118,7 @@ do
      shift
      ;;
    #Handle known nvcc args
-    -gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage)
+    -gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
      cuda_args="$cuda_args $1"
      ;;
    #Handle known nvcc args that have an argument
@@ -175,10 +186,15 @@ do
      object_files_xlinker="$object_files_xlinker -Xlinker $1"
      ;;
    #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
-    *.so.*|*.dylib)
+    *.dylib)
      object_files="$object_files -Xlinker $1"
      object_files_xlinker="$object_files_xlinker -Xlinker $1"
      ;;
+    #Handle shared 
libraries with *.so.* names which nvcc can't do. + *.so.*) + shared_versioned_libraries_host="$shared_versioned_libraries_host $1" + shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1" + ;; #All other args are sent to the host compiler *) if [ $first_xcompiler_arg -eq 1 ]; then @@ -204,13 +220,13 @@ if [ $arch_set -ne 1 ]; then fi #Compose compilation command -nvcc_command="nvcc $cuda_args $shared_args $xlinker_args" +nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries" if [ $first_xcompiler_arg -eq 0 ]; then nvcc_command="$nvcc_command -Xcompiler $xcompiler_args" fi #Compose host only command -host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args" +host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host" #nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING' if [ $replace_pragma_ident -eq 1 ]; then diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia index add45b77b4..aac036a8f3 100755 --- a/lib/kokkos/config/test_all_sandia +++ b/lib/kokkos/config/test_all_sandia @@ -6,18 +6,132 @@ set -o pipefail +# Determine current machine + +MACHINE="" +HOSTNAME=$(hostname) +if [[ "$HOSTNAME" =~ (white|ride).* ]]; then + MACHINE=white +elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then + MACHINE=bowman +elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name + MACHINE=shepard +elif [ ! 
-z "$SEMS_MODULEFILES_ROOT" ]; then + MACHINE=sems +else + echo "Unrecognized machine" >&2 + exit 1 +fi + GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" +IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" +IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" CUDA_WARNING_FLAGS="" -BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base" -CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base" +# Default. 
Machine specific can override +DEBUG=False +ARGS="" +CUSTOM_BUILD_LIST="" +DRYRUN=False +BUILD_ONLY=False +declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3 +TEST_SCRIPT=False +SKIP_HWLOC=False + +ARCH_FLAG="" + +# +# Machine specific config +# + +if [ "$MACHINE" = "sems" ]; then + source /projects/modulefiles/utils/sems-modules-init.sh + source /projects/modulefiles/utils/kokkos-modules-init.sh + + BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base" + CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + +elif [ "$MACHINE" = "white" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" + IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>" + CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.9.2" + + # Don't do pthread on white + 
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + ) + + ARCH_FLAG="--arch=Power8" + NUM_JOBS_TO_RUN_IN_PARALLEL=8 + +elif [ "$MACHINE" = "bowman" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" + + OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ) + + ARCH_FLAG="--arch=KNL" + NUM_JOBS_TO_RUN_IN_PARALLEL=8 + +elif [ "$MACHINE" = "shepard" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" + + OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ) + + ARCH_FLAG="--arch=HSW" + NUM_JOBS_TO_RUN_IN_PARALLEL=8 + +else + echo "Unhandled machine $MACHINE" >&2 + exit 1 +fi export OMP_NUM_THREADS=4 @@ -25,23 +139,12 @@ declare -i NUM_RESULTS_TO_KEEP=7 RESULT_ROOT_PREFIX=TestAll -source /projects/modulefiles/utils/sems-modules-init.sh -source /projects/modulefiles/utils/kokkos-modules-init.sh - SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. 
&& pwd ) # # Handle arguments # -DEBUG=False -ARGS="" -CUSTOM_BUILD_LIST="" -DRYRUN=False -BUILD_ONLY=False -declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3 -TEST_SCRIPT=False - while [[ $# > 0 ]] do key="$1" @@ -61,6 +164,9 @@ BUILD_ONLY=True --test-script*) TEST_SCRIPT=True ;; +--skip-hwloc*) +SKIP_HWLOC=True +;; --num*) NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" ;; @@ -73,6 +179,7 @@ echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" echo " Defaults to root repo containing this script" echo "--debug: Run tests in debug. Defaults to False" echo "--test-script: Test this script, not Kokkos" +echo "--skip-hwloc: Do not do hwloc tests" echo "--num=N: Number of jobs to run in parallel " echo "--dry-run: Just print what would be executed" echo "--build-only: Just do builds, don't run anything" @@ -82,21 +189,16 @@ echo " Valid items:" echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" echo "" + echo "ARGS: list of expressions matching compilers to test" -echo " supported compilers" -echo " gcc/4.7.2" -echo " gcc/4.8.4" -echo " gcc/4.9.2" -echo " gcc/5.1.0" -echo " intel/14.0.4" -echo " intel/15.0.2" -echo " intel/16.0.1" -echo " clang/3.5.2" -echo " clang/3.6.1" -echo " cuda/6.5.14" -echo " cuda/7.0.28" -echo " cuda/7.5.18" +echo " supported compilers sems" +for COMPILER_DATA in "${COMPILERS[@]}"; do + ARR=($COMPILER_DATA) + COMPILER=${ARR[0]} + echo " $COMPILER" +done echo "" + echo "Examples:" echo " Run all tests" echo " % test_all_sandia" @@ -147,21 +249,6 @@ if [ -z "$ARGS" ]; then ARGS='?' 
fi -# Format: (compiler module-list build-list exe-name warning-flag) -COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - # Process args to figure out which compilers to test COMPILERS_TO_TEST="" for ARG in $ARGS; do @@ -240,18 +327,19 @@ run_cmd() { fi } -# report_and_log_test_results <SUCCESS> <DESC> <PHASE> +# report_and_log_test_results <SUCCESS> <DESC> <COMMENT> report_and_log_test_result() { # Use sane var names - local success=$1; local desc=$2; local phase=$3; + local success=$1; local desc=$2; local comment=$3; if [ "$success" = "0" ]; then echo " PASSED $desc" - touch $PASSED_DIR/$desc + echo $comment > $PASSED_DIR/$desc else + # For failures, comment should be the name of the phase that failed echo " FAILED $desc" >&2 - echo $phase > $FAILED_DIR/$desc - cat ${desc}.${phase}.log + echo $comment > $FAILED_DIR/$desc + cat ${desc}.${comment}.log fi } @@ -309,6 +397,8 @@ single_build_and_test() { echo " Starting job $desc" + local comment="no_comment" + if [ "$TEST_SCRIPT" = "True" ]; then local rand=$[ 1 + $[ RANDOM % 
10 ]] sleep $rand @@ -316,14 +406,19 @@ single_build_and_test() { run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } fi else - run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + local -i build_start_time=$(date +%s) run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } + local -i build_end_time=$(date +%s) + comment="build_time=$(($build_end_time-$build_start_time))" if [[ "$BUILD_ONLY" == False ]]; then run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } + local -i run_end_time=$(date +%s) + comment="$comment run_time=$(($run_end_time-$build_end_time))" fi fi - report_and_log_test_result 0 $desc + report_and_log_test_result 0 $desc "$comment" return 0 } @@ -374,7 +469,7 @@ build_and_test_all() { run_in_background $compiler $build $BUILD_TYPE # If not cuda, do a hwloc test too - if [[ "$compiler" != cuda* ]]; then + if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then run_in_background $compiler $build "hwloc-$BUILD_TYPE" fi done @@ -401,7 +496,11 @@ wait_summarize_and_exit() { echo "PASSED TESTS" echo "#######################################################" - \ls -1 $PASSED_DIR | sort + local passed_test + for passed_test in $(\ls -1 $PASSED_DIR | sort) + do + echo $passed_test $(cat $PASSED_DIR/$passed_test) + done echo "#######################################################" echo "FAILED TESTS" @@ -409,7 +508,7 @@ wait_summarize_and_exit() { local failed_test local -i 
rv=0 - for failed_test in $(\ls -1 $FAILED_DIR) + for failed_test in $(\ls -1 $FAILED_DIR | sort) do echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" rv=$rv+1 diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt index 6b57802935..726d403452 100644 --- a/lib/kokkos/containers/performance_tests/CMakeLists.txt +++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt @@ -16,11 +16,22 @@ IF(Kokkos_ENABLE_OpenMP) LIST( APPEND SOURCES TestOpenMP.cpp) ENDIF() -TRIBITS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest +# Per #374, we always want to build this test, but we only want to run +# it as a PERFORMANCE test. That's why we separate building the test +# from running the test. + +TRIBITS_ADD_EXECUTABLE( + PerfTestExec SOURCES ${SOURCES} COMM serial mpi + TESTONLYLIBS kokkos_gtest + ) + +TRIBITS_ADD_TEST( + PerformanceTest + NAME PerfTestExec + COMM serial mpi NUM_MPI_PROCS 1 + CATEGORIES PERFORMANCE FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest ) diff --git a/lib/kokkos/containers/performance_tests/TestCuda.cpp b/lib/kokkos/containers/performance_tests/TestCuda.cpp index aee262de93..8183adaa60 100644 --- a/lib/kokkos/containers/performance_tests/TestCuda.cpp +++ b/lib/kokkos/containers/performance_tests/TestCuda.cpp @@ -54,6 +54,8 @@ #if defined( KOKKOS_HAVE_CUDA ) +#include <TestDynRankView.hpp> + #include <Kokkos_UnorderedMap.hpp> #include <TestGlobal2LocalIds.hpp> @@ -77,6 +79,13 @@ protected: } }; +TEST_F( cuda, dynrankview_perf ) +{ + std::cout << "Cuda" << std::endl; + std::cout << " DynRankView vs View: Initialization Only " << std::endl; + test_dynrankview_op_perf<Kokkos::Cuda>( 4096 ); +} + TEST_F( cuda, global_2_local) { std::cout << "Cuda" << std::endl; diff --git a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp new file mode 100644 index 0000000000..aab6e6988f --- /dev/null +++ 
b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp @@ -0,0 +1,265 @@ + +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_DYNRANKVIEW_HPP +#define KOKKOS_TEST_DYNRANKVIEW_HPP + +#include <Kokkos_Core.hpp> +#include <Kokkos_DynRankView.hpp> +#include <vector> + +#include <impl/Kokkos_Timer.hpp> + +// Compare performance of DynRankView to View, specific focus on the parenthesis operators + +namespace Performance { + +//View functor +template <typename DeviceType> +struct InitViewFunctor { + typedef Kokkos::View<double***, DeviceType> inviewtype; + inviewtype _inview; + + InitViewFunctor( inviewtype &inview_ ) : _inview(inview_) + {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.dimension(1); ++j) { + for (unsigned k = 0; k < _inview.dimension(2); ++k) { + _inview(i,j,k) = i/2 -j*j + k/3; + } + } + } + + struct SumComputationTest + { + typedef Kokkos::View<double***, DeviceType> inviewtype; + inviewtype _inview; + + typedef Kokkos::View<double*, DeviceType> outviewtype; + outviewtype _outview; + + KOKKOS_INLINE_FUNCTION + SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.dimension(1); ++j) { + for (unsigned k = 0; k < _inview.dimension(2); ++k) { + _outview(i) += _inview(i,j,k) ; + } + } + } + }; + +}; + +template <typename DeviceType> +struct InitStrideViewFunctor { + typedef Kokkos::View<double***, Kokkos::LayoutStride, DeviceType> inviewtype; + inviewtype _inview; + + InitStrideViewFunctor( inviewtype &inview_ ) : _inview(inview_) + {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.dimension(1); ++j) { + for (unsigned k = 0; k < _inview.dimension(2); ++k) { + _inview(i,j,k) = i/2 -j*j + k/3; + } + } + } + +}; + +template <typename DeviceType> +struct InitViewRank7Functor { + 
typedef Kokkos::View<double*******, DeviceType> inviewtype; + inviewtype _inview; + + InitViewRank7Functor( inviewtype &inview_ ) : _inview(inview_) + {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.dimension(1); ++j) { + for (unsigned k = 0; k < _inview.dimension(2); ++k) { + _inview(i,j,k,0,0,0,0) = i/2 -j*j + k/3; + } + } + } + +}; + +//DynRankView functor +template <typename DeviceType> +struct InitDynRankViewFunctor { + typedef Kokkos::DynRankView<double, DeviceType> inviewtype; + inviewtype _inview; + + InitDynRankViewFunctor( inviewtype &inview_ ) : _inview(inview_) + {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.dimension(1); ++j) { + for (unsigned k = 0; k < _inview.dimension(2); ++k) { + _inview(i,j,k) = i/2 -j*j + k/3; + } + } + } + + struct SumComputationTest + { + typedef Kokkos::DynRankView<double, DeviceType> inviewtype; + inviewtype _inview; + + typedef Kokkos::DynRankView<double, DeviceType> outviewtype; + outviewtype _outview; + + KOKKOS_INLINE_FUNCTION + SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.dimension(1); ++j) { + for (unsigned k = 0; k < _inview.dimension(2); ++k) { + _outview(i) += _inview(i,j,k) ; + } + } + } + }; + +}; + + +template <typename DeviceType> +void test_dynrankview_op_perf( const int par_size ) +{ + + typedef DeviceType execution_space; + typedef typename execution_space::size_type size_type; + const size_type dim2 = 900; + const size_type dim3 = 300; + + double elapsed_time_view = 0; + double elapsed_time_compview = 0; + double elapsed_time_strideview = 0; + double elapsed_time_view_rank7 = 0; + double elapsed_time_drview = 0; + double elapsed_time_compdrview = 0; + Kokkos::Timer timer; + { + Kokkos::View<double***,DeviceType> 
testview("testview",par_size,dim2,dim3);
+  typedef InitViewFunctor<DeviceType> FunctorType;
+
+  timer.reset();
+  Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+  Kokkos::parallel_for( policy , FunctorType(testview) );
+  DeviceType::fence();
+  elapsed_time_view = timer.seconds();
+  std::cout << " View time (init only): " << elapsed_time_view << std::endl;
+
+
+  timer.reset();
+  Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
+  Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
+  DeviceType::fence();
+  elapsed_time_compview = timer.seconds();
+  std::cout << " View sum computation time: " << elapsed_time_compview << std::endl;
+
+
+  Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
+  typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
+
+  timer.reset();
+  Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
+  DeviceType::fence();
+  elapsed_time_strideview = timer.seconds();
+  std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
+  }
+  {
+  Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1);
+  typedef InitViewRank7Functor<DeviceType> FunctorType;
+
+  timer.reset();
+  Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+  Kokkos::parallel_for( policy , FunctorType(testview) );
+  DeviceType::fence();
+  elapsed_time_view_rank7 = timer.seconds();
+  std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
+  }
+  {
+  Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3);
+  typedef InitDynRankViewFunctor<DeviceType> FunctorType;
+
+  timer.reset();
+  Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+  Kokkos::parallel_for( policy , FunctorType(testdrview) );
+  DeviceType::fence();
+  elapsed_time_drview = timer.seconds();
+  std::cout << " DynRankView time (init only): " << 
elapsed_time_drview << std::endl; + + timer.reset(); + Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size); + Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) ); + DeviceType::fence(); + elapsed_time_compdrview = timer.seconds(); + std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl; + + } + + std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1 + std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1 + std::cout << " Ratio of View to View Rank7 time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1 + std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1 + std::cout << " Ratio of DynRankView to View Rank7 time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ? 
+ + timer.reset(); + +} //end test_dynrankview + + +} //end Performance +#endif diff --git a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp index fb70b8fe2e..66f1fbf092 100644 --- a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp +++ b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp @@ -178,7 +178,7 @@ void test_global_to_local_ids(unsigned num_ids) std::cout << num_ids << ", "; double elasped_time = 0; - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; local_id_view local_2_global("local_ids", num_ids); global_id_view global_2_local((3u*num_ids)/2u); diff --git a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp index 82a9311df7..da74d32ac1 100644 --- a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp +++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp @@ -50,6 +50,8 @@ #include <TestGlobal2LocalIds.hpp> #include <TestUnorderedMapPerformance.hpp> +#include <TestDynRankView.hpp> + #include <iomanip> #include <sstream> #include <string> @@ -91,6 +93,13 @@ protected: } }; +TEST_F( openmp, dynrankview_perf ) +{ + std::cout << "OpenMP" << std::endl; + std::cout << " DynRankView vs View: Initialization Only " << std::endl; + test_dynrankview_op_perf<Kokkos::OpenMP>( 8192 ); +} + TEST_F( openmp, global_2_local) { std::cout << "OpenMP" << std::endl; diff --git a/lib/kokkos/containers/performance_tests/TestThreads.cpp b/lib/kokkos/containers/performance_tests/TestThreads.cpp index 04d9dc0c18..4179b7de4c 100644 --- a/lib/kokkos/containers/performance_tests/TestThreads.cpp +++ b/lib/kokkos/containers/performance_tests/TestThreads.cpp @@ -52,6 +52,8 @@ #include <TestGlobal2LocalIds.hpp> #include <TestUnorderedMapPerformance.hpp> +#include <TestDynRankView.hpp> + #include <iomanip> #include <sstream> #include <string> @@ -85,6 +87,13 @@ protected: } }; +TEST_F( threads, dynrankview_perf 
) +{ + std::cout << "Threads" << std::endl; + std::cout << " DynRankView vs View: Initialization Only " << std::endl; + test_dynrankview_op_perf<Kokkos::Threads>( 8192 ); +} + TEST_F( threads, global_2_local) { std::cout << "Threads" << std::endl; diff --git a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp index 975800229c..71d1182cbe 100644 --- a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp +++ b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp @@ -80,7 +80,7 @@ struct UnorderedMapTest , map(capacity) , histogram(map.get_histogram()) { - Kokkos::Impl::Timer wall_clock ; + Kokkos::Timer wall_clock ; wall_clock.reset(); value_type v = {}; @@ -228,7 +228,7 @@ void run_performance_tests(std::string const & base_file_name) distance_out << "\b\b\b " << std::endl; block_distance_out << "\b\b\b " << std::endl; - Kokkos::Impl::Timer wall_clock ; + Kokkos::Timer wall_clock ; for (int i=0; i < num_collisions ; ++i) { wall_clock.reset(); std::cout << "Collisions: " << collisions[i] << std::endl; diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 0fc722c140..f72277700a 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -52,6 +52,12 @@ * 2. Max rank of a DynRankView is 7 * 3. subview name is subdynrankview * 4. Every subdynrankview is returned with LayoutStride + * + * NEW: Redesigned DynRankView + * 5. subview function name now available + * 6. Copy and Copy-Assign View to DynRankView + * 7. deep_copy between Views and DynRankViews + * 8. rank( view ); returns the rank of View or DynRankView */ #ifndef KOKKOS_DYNRANKVIEW_HPP @@ -64,11 +70,16 @@ namespace Kokkos { namespace Experimental { +template< typename DataType , class ... 
Properties > +class DynRankView; //forward declare + namespace Impl { template <typename Specialize> struct DynRankDimTraits { + enum : size_t{unspecified = ~size_t(0)}; + // Compute the rank of the view from the nonzero dimension arguments. KOKKOS_INLINE_FUNCTION static size_t computeRank( const size_t N0 @@ -81,13 +92,13 @@ struct DynRankDimTraits { , const size_t N7 ) { return - ( (N6 == 0 && N5 == 0 && N4 == 0 && N3 == 0 && N2 == 0 && N1 == 0 && N0 == 0) ? 0 - : ( (N6 == 0 && N5 == 0 && N4 == 0 && N3 == 0 && N2 == 0 && N1 == 0) ? 1 - : ( (N6 == 0 && N5 == 0 && N4 == 0 && N3 == 0 && N2 == 0) ? 2 - : ( (N6 == 0 && N5 == 0 && N4 == 0 && N3 == 0) ? 3 - : ( (N6 == 0 && N5 == 0 && N4 == 0) ? 4 - : ( (N6 == 0 && N5 == 0) ? 5 - : ( (N6 == 0) ? 6 + ( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified && N1 == unspecified && N0 == unspecified) ? 0 + : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified && N1 == unspecified) ? 1 + : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified) ? 2 + : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified) ? 3 + : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified) ? 4 + : ( (N6 == unspecified && N5 == unspecified) ? 5 + : ( (N6 == unspecified) ? 6 : 7 ) ) ) ) ) ) ); } @@ -112,14 +123,14 @@ struct DynRankDimTraits { KOKKOS_INLINE_FUNCTION static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) , Layout >::type createLayout( const Layout& layout ) { - return Layout( layout.dimension[0] != 0 ? layout.dimension[0] : 1 - , layout.dimension[1] != 0 ? layout.dimension[1] : 1 - , layout.dimension[2] != 0 ? layout.dimension[2] : 1 - , layout.dimension[3] != 0 ? layout.dimension[3] : 1 - , layout.dimension[4] != 0 ? 
layout.dimension[4] : 1 - , layout.dimension[5] != 0 ? layout.dimension[5] : 1 - , layout.dimension[6] != 0 ? layout.dimension[6] : 1 - , layout.dimension[7] != 0 ? layout.dimension[7] : 1 + return Layout( layout.dimension[0] != unspecified ? layout.dimension[0] : 1 + , layout.dimension[1] != unspecified ? layout.dimension[1] : 1 + , layout.dimension[2] != unspecified ? layout.dimension[2] : 1 + , layout.dimension[3] != unspecified ? layout.dimension[3] : 1 + , layout.dimension[4] != unspecified ? layout.dimension[4] : 1 + , layout.dimension[5] != unspecified ? layout.dimension[5] : 1 + , layout.dimension[6] != unspecified ? layout.dimension[6] : 1 + , layout.dimension[7] != unspecified ? layout.dimension[7] : 1 ); } @@ -128,21 +139,21 @@ struct DynRankDimTraits { KOKKOS_INLINE_FUNCTION static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) , Layout>::type createLayout( const Layout& layout ) { - return Layout( layout.dimension[0] != 0 ? layout.dimension[0] : 1 + return Layout( layout.dimension[0] != unspecified ? layout.dimension[0] : 1 , layout.stride[0] - , layout.dimension[1] != 0 ? layout.dimension[1] : 1 + , layout.dimension[1] != unspecified ? layout.dimension[1] : 1 , layout.stride[1] - , layout.dimension[2] != 0 ? layout.dimension[2] : 1 + , layout.dimension[2] != unspecified ? layout.dimension[2] : 1 , layout.stride[2] - , layout.dimension[3] != 0 ? layout.dimension[3] : 1 + , layout.dimension[3] != unspecified ? layout.dimension[3] : 1 , layout.stride[3] - , layout.dimension[4] != 0 ? layout.dimension[4] : 1 + , layout.dimension[4] != unspecified ? layout.dimension[4] : 1 , layout.stride[4] - , layout.dimension[5] != 0 ? layout.dimension[5] : 1 + , layout.dimension[5] != unspecified ? layout.dimension[5] : 1 , layout.stride[5] - , layout.dimension[6] != 0 ? layout.dimension[6] : 1 + , layout.dimension[6] != unspecified ? layout.dimension[6] : 1 , layout.stride[6] - , layout.dimension[7] != 0 ? 
layout.dimension[7] : 1 + , layout.dimension[7] != unspecified ? layout.dimension[7] : 1 , layout.stride[7] ); } @@ -161,17 +172,141 @@ struct DynRankDimTraits { , const size_t N7 ) { return ViewType( arg - , N0 != 0 ? N0 : 1 - , N1 != 0 ? N1 : 1 - , N2 != 0 ? N2 : 1 - , N3 != 0 ? N3 : 1 - , N4 != 0 ? N4 : 1 - , N5 != 0 ? N5 : 1 - , N6 != 0 ? N6 : 1 - , N7 != 0 ? N7 : 1 ); + , N0 != unspecified ? N0 : 1 + , N1 != unspecified ? N1 : 1 + , N2 != unspecified ? N2 : 1 + , N3 != unspecified ? N3 : 1 + , N4 != unspecified ? N4 : 1 + , N5 != unspecified ? N5 : 1 + , N6 != unspecified ? N6 : 1 + , N7 != unspecified ? N7 : 1 ); } }; + // Non-strided Layout + template <typename Layout , typename iType> + KOKKOS_INLINE_FUNCTION + static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank ) + { + return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0) + , dynrank > 1 ? layout.dimension[1] : ~size_t(0) + , dynrank > 2 ? layout.dimension[2] : ~size_t(0) + , dynrank > 3 ? layout.dimension[3] : ~size_t(0) + , dynrank > 4 ? layout.dimension[4] : ~size_t(0) + , dynrank > 5 ? layout.dimension[5] : ~size_t(0) + , dynrank > 6 ? layout.dimension[6] : ~size_t(0) + , dynrank > 7 ? layout.dimension[7] : ~size_t(0) + ); + } + + // LayoutStride + template <typename Layout , typename iType> + KOKKOS_INLINE_FUNCTION + static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank ) + { + return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0) + , dynrank > 0 ? layout.stride[0] : (0) + , dynrank > 1 ? layout.dimension[1] : ~size_t(0) + , dynrank > 1 ? layout.stride[1] : (0) + , dynrank > 2 ? layout.dimension[2] : ~size_t(0) + , dynrank > 2 ? 
layout.stride[2] : (0) + , dynrank > 3 ? layout.dimension[3] : ~size_t(0) + , dynrank > 3 ? layout.stride[3] : (0) + , dynrank > 4 ? layout.dimension[4] : ~size_t(0) + , dynrank > 4 ? layout.stride[4] : (0) + , dynrank > 5 ? layout.dimension[5] : ~size_t(0) + , dynrank > 5 ? layout.stride[5] : (0) + , dynrank > 6 ? layout.dimension[6] : ~size_t(0) + , dynrank > 6 ? layout.stride[6] : (0) + , dynrank > 7 ? layout.dimension[7] : ~size_t(0) + , dynrank > 7 ? layout.stride[7] : (0) + ); + } + + template < typename DynRankViewType , typename iType > + void verify_dynrankview_rank ( iType N , const DynRankViewType &drv ) + { + if ( static_cast<iType>(drv.rank()) > N ) + { + Kokkos::abort( "Need at least rank arguments to the operator()" ); + } + } + + +/** \brief Assign compatible default mappings */ +struct ViewToDynRankViewTag {}; + +template< class DstTraits , class SrcTraits > +class ViewMapping< DstTraits , SrcTraits , + typename std::enable_if<( + std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value + && + std::is_same< typename DstTraits::specialize , void >::value + && + std::is_same< typename SrcTraits::specialize , void >::value + && + ( + std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value + || + ( + ( + std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value || + std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value || + std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value + ) + && + ( + std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value || + std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value || + std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value + ) + ) + ) + ) , ViewToDynRankViewTag >::type > +{ +private: + + enum { is_assignable_value_type = + std::is_same< typename DstTraits::value_type + , typename SrcTraits::value_type >::value 
|| + std::is_same< typename DstTraits::value_type + , typename SrcTraits::const_value_type >::value }; + + enum { is_assignable_layout = + std::is_same< typename DstTraits::array_layout + , typename SrcTraits::array_layout >::value || + std::is_same< typename DstTraits::array_layout + , Kokkos::LayoutStride >::value + }; + +public: + + enum { is_assignable = is_assignable_value_type && + is_assignable_layout }; + + typedef ViewMapping< DstTraits , void > DstType ; + typedef ViewMapping< SrcTraits , void > SrcType ; + + template < typename DT , typename ... DP , typename ST , typename ... SP > + KOKKOS_INLINE_FUNCTION + static void assign( Kokkos::Experimental::DynRankView< DT , DP...> & dst , const Kokkos::View< ST , SP... > & src ) + { + static_assert( is_assignable_value_type + , "View assignment must have same value type or const = non-const" ); + + static_assert( is_assignable_layout + , "View assignment must have compatible layout or have rank <= 1" ); + + // Removed dimension checks... + + typedef typename DstType::offset_type dst_offset_type ; + dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc + dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track ); + dst.m_track.assign( src.m_track , DstTraits::is_managed ); + dst.m_rank = src.Rank ; + } +}; + } //end Impl /* \class DynRankView @@ -185,145 +320,228 @@ struct DynRankDimTraits { * 3. subview name is subdynrankview * 4. Every subdynrankview is returned with LayoutStride * + * NEW: Redesigned DynRankView + * 5. subview function name now available + * 6. Copy and Copy-Assign View to DynRankView + * 7. deep_copy between Views and DynRankViews + * 8. rank( view ); returns the rank of View or DynRankView + * */ +template< class > struct is_dyn_rank_view : public std::false_type {}; + +template< class D, class ... 
P > +struct is_dyn_rank_view< Kokkos::Experimental::DynRankView<D,P...> > : public std::true_type {}; + + template< typename DataType , class ... Properties > -class DynRankView : private View< DataType*******, Properties... > +class DynRankView : public ViewTraits< DataType , Properties ... > { static_assert( !std::is_array<DataType>::value && !std::is_pointer<DataType>::value , "Cannot template DynRankView with array or pointer datatype - must be pod" ); -public: - using view_type = View< DataType******* , Properties...>; - using reference_type = typename view_type::reference_type; - private: template < class , class ... > friend class DynRankView ; - template< class , class ... > friend class Impl::ViewMapping ; - unsigned m_rank; - -public: - KOKKOS_INLINE_FUNCTION - view_type & DownCast() const { return static_cast< view_type & > (*this); } - KOKKOS_INLINE_FUNCTION - const view_type & ConstDownCast() const { return static_cast< const view_type & > (*this); } +// template < class , class ... > friend class Kokkos::Experimental::View ; //unnecessary now... + template < class , class ... > friend class Impl::ViewMapping ; - typedef ViewTraits< DataType , Properties ... > traits ; +public: + typedef ViewTraits< DataType , Properties ... > drvtraits ; - // Data type traits: - typedef typename traits::data_type data_type; - typedef typename traits::const_data_type const_data_type; - typedef typename traits::non_const_data_type non_const_data_type; + typedef View< DataType******* , Properties...> view_type ; - // Compatible array of trivial type traits: - typedef typename traits::scalar_array_type scalar_array_type ; - typedef typename traits::const_scalar_array_type const_scalar_array_type ; - typedef typename traits::non_const_scalar_array_type non_const_scalar_array_type ; + typedef ViewTraits< DataType******* , Properties ... 
> traits ; - // Value type traits: - typedef typename traits::value_type value_type ; - typedef typename traits::const_value_type const_value_type ; - typedef typename traits::non_const_value_type non_const_value_type ; - // Mapping traits: - typedef typename traits::array_layout array_layout ; - typedef typename traits::specialize specialize ; +private: + typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ; + typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ; - // Execution space, memory space, memory access traits, and host mirror space: - typedef typename traits::execution_space execution_space ; - typedef typename traits::memory_space memory_space ; - typedef typename traits::device_type device_type ; - typedef typename traits::memory_traits memory_traits ; - typedef typename traits::host_mirror_space host_mirror_space ; + track_type m_track ; + map_type m_map ; + unsigned m_rank; - typedef typename traits::size_type size_type ; +public: + KOKKOS_INLINE_FUNCTION + view_type & DownCast() const { return ( view_type & ) (*this); } + KOKKOS_INLINE_FUNCTION + const view_type & ConstDownCast() const { return (const view_type & ) (*this); } - using view_type::is_hostspace ; - using view_type::is_managed ; - using view_type::is_random_access ; + //Types below - at least the HostMirror requires the value_type, NOT the rank 7 data_type of the traits /** \brief Compatible view of array of scalar types */ - typedef DynRankView< typename traits::scalar_array_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > + typedef DynRankView< typename drvtraits::scalar_array_type , + typename drvtraits::array_layout , + typename drvtraits::device_type , + typename drvtraits::memory_traits > array_type ; /** \brief Compatible view of const data type */ - typedef DynRankView< typename traits::const_data_type , - typename traits::array_layout , - typename traits::device_type , - typename 
traits::memory_traits > + typedef DynRankView< typename drvtraits::const_data_type , + typename drvtraits::array_layout , + typename drvtraits::device_type , + typename drvtraits::memory_traits > const_type ; /** \brief Compatible view of non-const data type */ - typedef DynRankView< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > + typedef DynRankView< typename drvtraits::non_const_data_type , + typename drvtraits::array_layout , + typename drvtraits::device_type , + typename drvtraits::memory_traits > non_const_type ; /** \brief Compatible HostMirror view */ - typedef DynRankView< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::host_mirror_space > + typedef DynRankView< typename drvtraits::non_const_data_type , + typename drvtraits::array_layout , + typename drvtraits::host_mirror_space > HostMirror ; + //---------------------------------------- // Domain rank and extents +// enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the enum? + + template< typename iType > + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if< std::is_integral<iType>::value , size_t >::type + extent( const iType & r ) const + { return m_map.extent(r); } + + template< typename iType > + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if< std::is_integral<iType>::value , int >::type + extent_int( const iType & r ) const + { return static_cast<int>(m_map.extent(r)); } + + KOKKOS_INLINE_FUNCTION constexpr + typename traits::array_layout layout() const + { return m_map.layout(); } + + //---------------------------------------- + /* Deprecate all 'dimension' functions in favor of + * ISO/C++ vocabulary 'extent'. 
+ */ + + template< typename iType > + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if< std::is_integral<iType>::value , size_t >::type + dimension( const iType & r ) const { return extent( r ); } + + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return m_map.dimension_0() * + m_map.dimension_1() * + m_map.dimension_2() * + m_map.dimension_3() * + m_map.dimension_4() * + m_map.dimension_5() * + m_map.dimension_6() * + m_map.dimension_7(); } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); } + + template< typename iType 
> + KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_map.stride(s); } + + //---------------------------------------- + // Range span is the span which contains all members. + + typedef typename map_type::reference_type reference_type ; + typedef typename map_type::pointer_type pointer_type ; + + enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value }; + + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + // Deprecated, use 'span()' instead + KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_map.span_is_contiguous(); } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return m_map.data(); } + + // Deprecated, use 'span_is_contigous()' instead + KOKKOS_INLINE_FUNCTION constexpr bool is_contiguous() const { return m_map.span_is_contiguous(); } + // Deprecated, use 'data()' instead + KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const { return m_map.data(); } + + //---------------------------------------- + // Allow specializations to query their specialized map + KOKKOS_INLINE_FUNCTION - DynRankView() : view_type() , m_rank(0) {} + const Kokkos::Experimental::Impl::ViewMapping< traits , void > & + implementation_map() const { return m_map ; } + + //---------------------------------------- + +private: + + enum { + is_layout_left = std::is_same< typename traits::array_layout + , Kokkos::LayoutLeft >::value , + + is_layout_right = std::is_same< typename traits::array_layout + , Kokkos::LayoutRight >::value , + + is_layout_stride = std::is_same< typename traits::array_layout + , Kokkos::LayoutStride >::value , + + is_default_map = + std::is_same< typename traits::specialize , void >::value && + ( is_layout_left || is_layout_right || is_layout_stride ) + }; + +// Bounds checking macros +#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) + +#define 
KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \ + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \ + < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \ + Kokkos::Experimental::Impl::verify_dynrankview_rank ( N , *this ) ; \ + Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ; + +#else + +#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \ + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \ + < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); + +#endif + +public: KOKKOS_INLINE_FUNCTION constexpr unsigned rank() const { return m_rank; } - using view_type::extent; - using view_type::extent_int; - using view_type::layout; - using view_type::dimension; - using view_type::size; - using view_type::stride; - - using pointer_type = typename view_type::pointer_type; - using view_type::reference_type_is_lvalue_reference; - using view_type::span; - using view_type::capacity; - using view_type::span_is_contiguous; - using view_type::data; - using view_type::implementation_map; - - using view_type::is_contiguous; - using view_type::ptr_on_device; - - //Deprecated, remove soon (add for test) - using view_type::dimension_0; - using view_type::dimension_1; - using view_type::dimension_2; - using view_type::dimension_3; - using view_type::dimension_4; - using view_type::dimension_5; - using view_type::dimension_6; - using view_type::dimension_7; - using view_type::stride_0; - using view_type::stride_1; - using view_type::stride_2; - using view_type::stride_3; - using view_type::stride_4; - using view_type::stride_5; - using view_type::stride_6; - using view_type::stride_7; //operators () // Rank 0 KOKKOS_INLINE_FUNCTION reference_type operator()() const - { return view_type::operator()(0,0,0,0,0,0,0); } - + { + KOKKOS_VIEW_OPERATOR_VERIFY( 0 , ( implementation_map() ) ) + return implementation_map().reference(); + //return m_map.reference(0,0,0,0,0,0,0); + } + // Rank 1 // This assumes a contiguous 
underlying memory (i.e. no padding, no striding...) template< typename iType > KOKKOS_INLINE_FUNCTION - typename std::enable_if< std::is_same<value_type, scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type + typename std::enable_if< std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type operator[](const iType & i0) const { return data()[i0]; @@ -333,59 +551,141 @@ public: // AND a Trilinos/Sacado scalar type ) template< typename iType > KOKKOS_INLINE_FUNCTION - typename std::enable_if< !std::is_same<value_type, scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type + typename std::enable_if< !std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type operator[](const iType & i0) const { - auto map = implementation_map(); - - const size_t dim_scalar = map.dimension_scalar(); +// auto map = implementation_map(); + const size_t dim_scalar = m_map.dimension_scalar(); const size_t bytes = this->span() / dim_scalar; - typedef Kokkos::View<DataType*, array_layout, device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged | memory_traits::RandomAccess | memory_traits::Atomic> > tmp_view_type; + typedef Kokkos::View<DataType*, typename traits::array_layout, typename traits::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged | traits::memory_traits::RandomAccess | traits::memory_traits::Atomic> > tmp_view_type; tmp_view_type rankone_view(this->data(), bytes, dim_scalar); return rankone_view(i0); } template< typename iType > KOKKOS_INLINE_FUNCTION - reference_type operator()(const iType & i0 ) const - { return view_type::operator()(i0,0,0,0,0,0,0); } + typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type + operator()(const iType & i0 ) const + { + 
KOKKOS_VIEW_OPERATOR_VERIFY( 1 , ( m_map , i0 ) ) + return m_map.reference(i0); + } + + template< typename iType > + KOKKOS_INLINE_FUNCTION + typename std::enable_if< !(std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type + operator()(const iType & i0 ) const + { + return m_map.reference(i0,0,0,0,0,0,0); + } // Rank 2 template< typename iType0 , typename iType1 > KOKKOS_INLINE_FUNCTION - reference_type operator()(const iType0 & i0 , const iType1 & i1 ) const - { return view_type::operator()(i0,i1,0,0,0,0,0); } + typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) ) + return m_map.reference(i0,i1); + } + + template< typename iType0 , typename iType1 > + KOKKOS_INLINE_FUNCTION + typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) ) + return m_map.reference(i0,i1,0,0,0,0,0); + } // Rank 3 template< typename iType0 , typename iType1 , typename iType2 > KOKKOS_INLINE_FUNCTION - reference_type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const - { return view_type::operator()(i0,i1,i2,0,0,0,0); } + typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) ) + return m_map.reference(i0,i1,i2); + } + + template< typename iType0 , typename iType1 , 
typename iType2 > + KOKKOS_INLINE_FUNCTION + typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) ) + return m_map.reference(i0,i1,i2,0,0,0,0); + } // Rank 4 template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > KOKKOS_INLINE_FUNCTION - reference_type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const - { return view_type::operator()(i0,i1,i2,i3,0,0,0); } + typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) ) + return m_map.reference(i0,i1,i2,i3); + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > + KOKKOS_INLINE_FUNCTION + typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) ) + return m_map.reference(i0,i1,i2,i3,0,0,0); + } // Rank 5 template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 > KOKKOS_INLINE_FUNCTION - reference_type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const - { return view_type::operator()(i0,i1,i2,i3,i4,0,0); } + typename std::enable_if< (std::is_same<typename traits::specialize , 
void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) ) + return m_map.reference(i0,i1,i2,i3,i4); + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 > + KOKKOS_INLINE_FUNCTION + typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) ) + return m_map.reference(i0,i1,i2,i3,i4,0,0); + } // Rank 6 template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 > KOKKOS_INLINE_FUNCTION - reference_type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const - { return view_type::operator()(i0,i1,i2,i3,i4,i5,0); } + typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) ) + return m_map.reference(i0,i1,i2,i3,i4,i5); + } + + template< typename iType0 , typename iType1 , typename iType2 , typename 
iType3, typename iType4 , typename iType5 > + KOKKOS_INLINE_FUNCTION + typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) ) + return m_map.reference(i0,i1,i2,i3,i4,i5,0); + } // Rank 7 template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 , typename iType6 > KOKKOS_INLINE_FUNCTION - reference_type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const - { return view_type::operator()(i0,i1,i2,i3,i4,i5,i6); } + typename std::enable_if< (std::is_integral<iType0>::value && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value && std::is_integral<iType6>::value), reference_type>::type + operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const + { + KOKKOS_VIEW_OPERATOR_VERIFY( 7 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 , i6 ) ) + return m_map.reference(i0,i1,i2,i3,i4,i5,i6); + } + +#undef KOKKOS_VIEW_OPERATOR_VERIFY //---------------------------------------- // Standard constructor, destructor, and assignment operators... 
@@ -394,46 +694,89 @@ public: ~DynRankView() {} KOKKOS_INLINE_FUNCTION - DynRankView( const DynRankView & ) = default ; + DynRankView() : m_track(), m_map(), m_rank() {} //Default ctor KOKKOS_INLINE_FUNCTION - DynRankView( DynRankView && ) = default ; + DynRankView( const DynRankView & rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ), m_rank(rhs.m_rank) {} KOKKOS_INLINE_FUNCTION - DynRankView & operator = ( const DynRankView & ) = default ; + DynRankView( DynRankView && rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ), m_rank(rhs.m_rank) {} KOKKOS_INLINE_FUNCTION - DynRankView & operator = ( DynRankView && ) = default ; + DynRankView & operator = ( const DynRankView & rhs ) { m_track = rhs.m_track; m_map = rhs.m_map; m_rank = rhs.m_rank; return *this; } + + KOKKOS_INLINE_FUNCTION + DynRankView & operator = ( DynRankView && rhs ) { m_track = rhs.m_track; m_map = rhs.m_map; m_rank = rhs.m_rank; return *this; } //---------------------------------------- // Compatible view copy constructor and assignment // may assign unmanaged from managed. - template< class RT , class ... RP > KOKKOS_INLINE_FUNCTION DynRankView( const DynRankView<RT,RP...> & rhs ) - : view_type( rhs.ConstDownCast() ) + : m_track( rhs.m_track , traits::is_managed ) + , m_map() , m_rank(rhs.m_rank) - {} + { + typedef typename DynRankView<RT,RP...> ::traits SrcTraits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" ); + Mapping::assign( m_map , rhs.m_map , rhs.m_track ); + } template< class RT , class ... 
RP > KOKKOS_INLINE_FUNCTION DynRankView & operator = (const DynRankView<RT,RP...> & rhs ) - { - view_type::operator = ( rhs.ConstDownCast() ); - m_rank = rhs.rank(); - return *this; - } + { + typedef typename DynRankView<RT,RP...> ::traits SrcTraits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" ); + Mapping::assign( m_map , rhs.m_map , rhs.m_track ); + m_track.assign( rhs.m_track , traits::is_managed ); + m_rank = rhs.rank(); + return *this; + } + +// Experimental +// Copy/Assign View to DynRankView + template< class RT , class ... RP > + KOKKOS_INLINE_FUNCTION + DynRankView( const View<RT,RP...> & rhs ) + : m_track() + , m_map() + , m_rank( rhs.Rank ) + { + typedef typename View<RT,RP...>::traits SrcTraits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ; + static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" ); + Mapping::assign( *this , rhs ); + } + + template< class RT , class ... 
RP > + KOKKOS_INLINE_FUNCTION + DynRankView & operator = ( const View<RT,RP...> & rhs ) + { + typedef typename View<RT,RP...>::traits SrcTraits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ; + static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" ); + Mapping::assign( *this , rhs ); + return *this ; + } //---------------------------------------- // Allocation tracking properties - using view_type::use_count; - using view_type::label; + KOKKOS_INLINE_FUNCTION + int use_count() const + { return m_track.use_count(); } + + inline + const std::string label() const + { return m_track.template get_label< typename traits::memory_space >(); } //---------------------------------------- // Allocation according to allocation properties and array layout - + // unused arg_layout dimensions must be set to ~size_t(0) so that rank deduction can properly take place template< class ... P > explicit inline DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop @@ -441,12 +784,77 @@ public: , typename traits::array_layout >::type const & arg_layout ) - : view_type( arg_prop - , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) ) + : m_track() + , m_map() , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) ) - {} + { + // Append layout and spaces if not input + typedef Impl::ViewCtorProp< P ... > alloc_prop_input ; + + // use 'std::integral_constant<unsigned,I>' for non-types + // to avoid duplicate class error. + typedef Impl::ViewCtorProp + < P ... 
+ , typename std::conditional + < alloc_prop_input::has_label + , std::integral_constant<unsigned,0> + , typename std::string + >::type + , typename std::conditional + < alloc_prop_input::has_memory_space + , std::integral_constant<unsigned,1> + , typename traits::device_type::memory_space + >::type + , typename std::conditional + < alloc_prop_input::has_execution_space + , std::integral_constant<unsigned,2> + , typename traits::device_type::execution_space + >::type + > alloc_prop ; + + static_assert( traits::is_managed + , "View allocation constructor requires managed memory" ); + + if ( alloc_prop::initialize && + ! alloc_prop::execution_space::is_initialized() ) { + // If initializing view data then + // the execution space must be initialized. + Kokkos::Impl::throw_runtime_exception("Constructing DynRankView and initializing data with uninitialized execution space"); + } + + // Copy the input allocation properties with possibly defaulted properties + alloc_prop prop( arg_prop ); + +//------------------------------------------------------------ +#if defined( KOKKOS_HAVE_CUDA ) + // If allocating in CudaUVMSpace must fence before and after + // the allocation to protect against possible concurrent access + // on the CPU and the GPU. + // Fence using the trait's executon space (which will be Kokkos::Cuda) + // to avoid incomplete type errors from usng Kokkos::Cuda directly. 
+ if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) { + traits::device_type::memory_space::execution_space::fence(); + } +#endif +//------------------------------------------------------------ -//Wrappers + Kokkos::Experimental::Impl::SharedAllocationRecord<> * + record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) ); + +//------------------------------------------------------------ +#if defined( KOKKOS_HAVE_CUDA ) + if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) { + traits::device_type::memory_space::execution_space::fence(); + } +#endif +//------------------------------------------------------------ + + // Setup and initialization complete, start tracking + m_track.assign_allocated_record_to_uninitialized( record ); + } + + + // Wrappers template< class ... P > explicit KOKKOS_INLINE_FUNCTION DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop @@ -454,10 +862,16 @@ public: , typename traits::array_layout >::type const & arg_layout ) - : view_type( arg_prop - , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) ) + : m_track() // No memory tracking + , m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) ) , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) ) - {} + { + static_assert( + std::is_same< pointer_type + , typename Impl::ViewCtorProp< P... >::pointer_type + >::value , + "Constructing DynRankView to wrap user memory must supply matching pointer type" ); + } //---------------------------------------- //Constructor(s) @@ -468,14 +882,14 @@ public: DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop , typename std::enable_if< ! Impl::ViewCtorProp< P... 
>::has_pointer , size_t - >::type const arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 + >::type const arg_N0 = ~size_t(0) + , const size_t arg_N1 = ~size_t(0) + , const size_t arg_N2 = ~size_t(0) + , const size_t arg_N3 = ~size_t(0) + , const size_t arg_N4 = ~size_t(0) + , const size_t arg_N5 = ~size_t(0) + , const size_t arg_N6 = ~size_t(0) + , const size_t arg_N7 = ~size_t(0) ) : DynRankView( arg_prop , typename traits::array_layout @@ -488,14 +902,14 @@ public: DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer , size_t - >::type const arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 + >::type const arg_N0 = ~size_t(0) + , const size_t arg_N1 = ~size_t(0) + , const size_t arg_N2 = ~size_t(0) + , const size_t arg_N3 = ~size_t(0) + , const size_t arg_N4 = ~size_t(0) + , const size_t arg_N5 = ~size_t(0) + , const size_t arg_N6 = ~size_t(0) + , const size_t arg_N7 = ~size_t(0) ) : DynRankView( arg_prop , typename traits::array_layout @@ -514,20 +928,20 @@ public: : DynRankView( Impl::ViewCtorProp< std::string >( arg_label ) , arg_layout ) {} - // Allocate label and layout + // Allocate label and layout, must disambiguate from subview constructor template< typename Label > explicit inline DynRankView( const Label & arg_label , typename std::enable_if< Kokkos::Experimental::Impl::is_view_label<Label>::value , - const size_t >::type arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 + const size_t >::type arg_N0 = ~size_t(0) + , 
const size_t arg_N1 = ~size_t(0) + , const size_t arg_N2 = ~size_t(0) + , const size_t arg_N3 = ~size_t(0) + , const size_t arg_N4 = ~size_t(0) + , const size_t arg_N5 = ~size_t(0) + , const size_t arg_N6 = ~size_t(0) + , const size_t arg_N7 = ~size_t(0) ) : DynRankView( Impl::ViewCtorProp< std::string >( arg_label ) , typename traits::array_layout @@ -536,44 +950,58 @@ public: {} // For backward compatibility -/* explicit inline DynRankView( const ViewAllocateWithoutInitializing & arg_prop , const typename traits::array_layout & arg_layout ) - : view_type( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing ) - , arg_layout - ) - //, m_rank(arg_N0 == 0 ? 0 : ( arg_N1 == 0 ? 1 : ( arg_N2 == 0 ? 2 : ( arg_N3 == 0 ? 3 : ( arg_N4 == 0 ? 4 : ( arg_N5 == 0 ? 5 : ( arg_N6 == 0 ? 6 : ( arg_N7 == 0 ? 7 : 8 ) ) ) ) ) ) ) ) //how to extract rank? + : DynRankView( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing ) + , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) + ) {} -*/ explicit inline DynRankView( const ViewAllocateWithoutInitializing & arg_prop - , const size_t arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 + , const size_t arg_N0 = ~size_t(0) + , const size_t arg_N1 = ~size_t(0) + , const size_t arg_N2 = ~size_t(0) + , const size_t arg_N3 = ~size_t(0) + , const size_t arg_N4 = ~size_t(0) + , const size_t arg_N5 = ~size_t(0) + , const size_t arg_N6 = ~size_t(0) + , const size_t arg_N7 = ~size_t(0) ) : DynRankView(Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing ), arg_N0, arg_N1, arg_N2, arg_N3, 
arg_N4, arg_N5, arg_N6, arg_N7 ) {} - using view_type::memory_span; + //---------------------------------------- + // Memory span required to wrap these dimensions. + static constexpr size_t required_allocation_size( + const size_t arg_N0 = 0 + , const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 + ) + { + return map_type::memory_span( + typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ); + } explicit KOKKOS_INLINE_FUNCTION DynRankView( pointer_type arg_ptr - , const size_t arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 + , const size_t arg_N0 = ~size_t(0) + , const size_t arg_N1 = ~size_t(0) + , const size_t arg_N2 = ~size_t(0) + , const size_t arg_N3 = ~size_t(0) + , const size_t arg_N4 = ~size_t(0) + , const size_t arg_N5 = ~size_t(0) + , const size_t arg_N6 = ~size_t(0) + , const size_t arg_N7 = ~size_t(0) ) : DynRankView( Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7 ) {} @@ -589,39 +1017,76 @@ public: //---------------------------------------- // Shared scratch memory constructor - using view_type::shmem_size; + static inline + size_t shmem_size( const size_t arg_N0 = ~size_t(0) , + const size_t arg_N1 = ~size_t(0) , + const size_t arg_N2 = ~size_t(0) , + const size_t arg_N3 = ~size_t(0) , + const size_t arg_N4 = ~size_t(0) , + const size_t arg_N5 = ~size_t(0) , + const size_t arg_N6 = ~size_t(0) , + const size_t arg_N7 = ~size_t(0) ) + { + const size_t num_passed_args = + ( arg_N0 != ~size_t(0) ) + ( arg_N1 != ~size_t(0) ) + ( arg_N2 != ~size_t(0) ) + + ( arg_N3 != ~size_t(0) ) + ( arg_N4 != ~size_t(0) ) + ( arg_N5 != ~size_t(0) ) + + ( arg_N6 
!= ~size_t(0) ) + ( arg_N7 != ~size_t(0) ); + + if ( std::is_same<typename traits::specialize , void>::value && num_passed_args != traits::rank_dynamic ) { + Kokkos::abort( "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n" ); + } + {} + + return map_type::memory_span( + typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ); + } + + explicit KOKKOS_INLINE_FUNCTION + DynRankView( const typename traits::execution_space::scratch_memory_space & arg_space + , const typename traits::array_layout & arg_layout ) + : DynRankView( Impl::ViewCtorProp<pointer_type>( + reinterpret_cast<pointer_type>( + arg_space.get_shmem( map_type::memory_span( + Impl::DynRankDimTraits<typename traits::specialize>::createLayout( arg_layout ) //is this correct? + ) ) ) ) + , arg_layout ) + {} explicit KOKKOS_INLINE_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space & arg_space - , const size_t arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 ) - : view_type( Impl::DynRankDimTraits<typename traits::specialize>::template createView<view_type>( arg_space - , arg_N0 - , arg_N1 - , arg_N2 - , arg_N3 - , arg_N4 - , arg_N5 - , arg_N6 - , arg_N7 ) ) - , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank( arg_N0 - , arg_N1 - , arg_N2 - , arg_N3 - , arg_N4 - , arg_N5 - , arg_N6 - , arg_N7 ) ) + , const size_t arg_N0 = ~size_t(0) + , const size_t arg_N1 = ~size_t(0) + , const size_t arg_N2 = ~size_t(0) + , const size_t arg_N3 = ~size_t(0) + , const size_t arg_N4 = ~size_t(0) + , const size_t arg_N5 = ~size_t(0) + , const size_t arg_N6 = ~size_t(0) + , const size_t arg_N7 = ~size_t(0) ) + + : DynRankView( Impl::ViewCtorProp<pointer_type>( + reinterpret_cast<pointer_type>( + arg_space.get_shmem( + map_type::memory_span( + 
Impl::DynRankDimTraits<typename traits::specialize>::createLayout( + typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) ) + ) + , typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) + ) {} }; + + template < typename D , class ... P > + KOKKOS_INLINE_FUNCTION + constexpr unsigned rank( const DynRankView<D , P...> & DRV ) { return DRV.rank(); } //needed for transition to common constexpr method in view and dynrankview to return rank + //---------------------------------------------------------------------------- // Subview mapping. // Deduce destination view type from source view traits and subview arguments @@ -719,11 +1184,11 @@ public: template < typename T , class ... P > KOKKOS_INLINE_FUNCTION - static ret_type subview( const unsigned src_rank , Kokkos::Experimental::View< T******* , P...> const & src + static ret_type subview( const unsigned src_rank , Kokkos::Experimental::DynRankView< T , P...> const & src , Args ... args ) { - typedef ViewMapping< traits_type, void > DstType ; + typedef ViewMapping< traits_type, void > DstType ; typedef typename std::conditional< (rank==0) , ViewDimension<> , typename std::conditional< (rank==1) , ViewDimension<0> @@ -801,13 +1266,21 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args. typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::Experimental::ViewTraits< D*******, P... > , Args... > metafcn ; - return metafcn::subview( src.rank() , src.ConstDownCast() , args... ); + return metafcn::subview( src.rank() , src , args... ); + } + +//Wrapper to allow subview function name +template< class D , class ... P , class ...Args > +KOKKOS_INLINE_FUNCTION +Subdynrankview< ViewTraits<D******* , P...> , Args... > +subview( const Kokkos::Experimental::DynRankView< D , P... 
> &src , Args...args) + { + return subdynrankview( src , args... ); } } // namespace Experimental } // namespace Kokkos - namespace Kokkos { namespace Experimental { @@ -854,6 +1327,109 @@ bool operator != ( const DynRankView<LT,LP...> & lhs , //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template< class OutputView , typename Enable = void > +struct DynRankViewFill { + + typedef typename OutputView::traits::const_value_type const_value_type ; + + const OutputView output ; + const_value_type input ; + + KOKKOS_INLINE_FUNCTION + void operator()( const size_t i0 ) const + { + const size_t n1 = output.dimension_1(); + const size_t n2 = output.dimension_2(); + const size_t n3 = output.dimension_3(); + const size_t n4 = output.dimension_4(); + const size_t n5 = output.dimension_5(); + const size_t n6 = output.dimension_6(); + + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) { + for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) { + for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) { + for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) { + output(i0,i1,i2,i3,i4,i5,i6) = input ; + }}}}}} + } + + DynRankViewFill( const OutputView & arg_out , const_value_type & arg_in ) + : output( arg_out ), input( arg_in ) + { + typedef typename OutputView::execution_space execution_space ; + typedef Kokkos::RangePolicy< execution_space > Policy ; + + const Kokkos::Impl::ParallelFor< DynRankViewFill , Policy > closure( *this , Policy( 0 , output.dimension_0() ) ); + + closure.execute(); + + execution_space::fence(); + } +}; + +template< class OutputView > +struct DynRankViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > { + DynRankViewFill( const OutputView & dst , const typename OutputView::const_value_type & src ) + { + 
Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace > + ( dst.data() , & src , sizeof(typename OutputView::const_value_type) ); + } +}; + +template< class OutputView , class InputView , class ExecSpace = typename OutputView::execution_space > +struct DynRankViewRemap { + + const OutputView output ; + const InputView input ; + const size_t n0 ; + const size_t n1 ; + const size_t n2 ; + const size_t n3 ; + const size_t n4 ; + const size_t n5 ; + const size_t n6 ; + const size_t n7 ; + + DynRankViewRemap( const OutputView & arg_out , const InputView & arg_in ) + : output( arg_out ), input( arg_in ) + , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) ) + , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) ) + , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) ) + , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) ) + , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) ) + , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) ) + , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) ) + , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) ) + { + typedef Kokkos::RangePolicy< ExecSpace > Policy ; + const Kokkos::Impl::ParallelFor< DynRankViewRemap , Policy > closure( *this , Policy( 0 , n0 ) ); + closure.execute(); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_t i0 ) const + { + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) { + for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) { + for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) { + for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) { + output(i0,i1,i2,i3,i4,i5,i6) = input(i0,i1,i2,i3,i4,i5,i6); + }}}}}} + } +}; + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + namespace Kokkos { namespace 
Experimental { @@ -863,9 +1439,17 @@ template< class DT , class ... DP > inline void deep_copy ( const DynRankView<DT,DP...> & dst - , typename ViewTraits<DT,DP...>::const_value_type & value ) + , typename ViewTraits<DT,DP...>::const_value_type & value + , typename std::enable_if< + std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value + >::type * = 0 ) { - deep_copy( dst.ConstDownCast() , value ); + static_assert( + std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type , + typename ViewTraits<DT,DP...>::value_type >::value + , "deep_copy requires non-const type" ); + + Kokkos::Experimental::Impl::DynRankViewFill< DynRankView<DT,DP...> >( dst , value ); } /** \brief Deep copy into a value in Host memory from a view. */ @@ -873,21 +1457,156 @@ template< class ST , class ... SP > inline void deep_copy ( typename ViewTraits<ST,SP...>::non_const_value_type & dst - , const DynRankView<ST,SP...> & src ) + , const DynRankView<ST,SP...> & src + , typename std::enable_if< + std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value + >::type * = 0 ) { - deep_copy( dst , src.ConstDownCast() ); -} + if ( src.rank() != 0 ) + { + Kokkos::abort(""); + } + typedef ViewTraits<ST,SP...> src_traits ; + typedef typename src_traits::memory_space src_memory_space ; + Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) ); +} //---------------------------------------------------------------------------- -/** \brief A deep copy between views of compatible type */ -template< class DT , class ... DP , class ST , class ... SP > +/** \brief A deep copy between views of the default specialization, compatible type, + * same rank, same contiguous layout. 
+ */ +template< class DstType , class SrcType > inline void deep_copy - ( const DynRankView<DT,DP...> & dst - , const DynRankView<ST,SP...> & src ) + ( const DstType & dst + , const SrcType & src + , typename std::enable_if<( + std::is_same< typename DstType::traits::specialize , void >::value && + std::is_same< typename SrcType::traits::specialize , void >::value + && + ( Kokkos::Experimental::is_dyn_rank_view<DstType>::value || Kokkos::Experimental::is_dyn_rank_view<SrcType>::value) + )>::type * = 0 ) { - deep_copy( dst.ConstDownCast() , src.ConstDownCast() ); + static_assert( + std::is_same< typename DstType::traits::value_type , + typename DstType::traits::non_const_value_type >::value + , "deep_copy requires non-const destination type" ); + + typedef DstType dst_type ; + typedef SrcType src_type ; + + typedef typename dst_type::execution_space dst_execution_space ; + typedef typename src_type::execution_space src_execution_space ; + typedef typename dst_type::memory_space dst_memory_space ; + typedef typename src_type::memory_space src_memory_space ; + + enum { DstExecCanAccessSrc = + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value }; + + enum { SrcExecCanAccessDst = + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value }; + + if ( (void *) dst.data() != (void*) src.data() ) { + + // Concern: If overlapping views then a parallel copy will be erroneous. + // ... 
+ + // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy + if ( rank(src) == 0 && rank(dst) == 0 ) + { + typedef typename dst_type::value_type value_type ; + Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) ); + } + else if ( std::is_same< typename DstType::traits::value_type , + typename SrcType::traits::non_const_value_type >::value && + ( + ( std::is_same< typename DstType::traits::array_layout , + typename SrcType::traits::array_layout >::value + && + ( std::is_same< typename DstType::traits::array_layout , + typename Kokkos::LayoutLeft>::value + || + std::is_same< typename DstType::traits::array_layout , + typename Kokkos::LayoutRight>::value + ) + ) + || + ( + rank(dst) == 1 + && + rank(src) == 1 + ) + ) && + dst.span_is_contiguous() && + src.span_is_contiguous() && + dst.span() == src.span() && + dst.dimension_0() == src.dimension_0() && + dst.dimension_1() == src.dimension_1() && + dst.dimension_2() == src.dimension_2() && + dst.dimension_3() == src.dimension_3() && + dst.dimension_4() == src.dimension_4() && + dst.dimension_5() == src.dimension_5() && + dst.dimension_6() == src.dimension_6() && + dst.dimension_7() == src.dimension_7() ) { + + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + + Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes ); + } + else if ( std::is_same< typename DstType::traits::value_type , + typename SrcType::traits::non_const_value_type >::value && + ( + ( std::is_same< typename DstType::traits::array_layout , + typename SrcType::traits::array_layout >::value + && + std::is_same< typename DstType::traits::array_layout , + typename Kokkos::LayoutStride>::value + ) + || + ( + rank(dst) == 1 + && + rank(src) == 1 + ) + ) && + dst.span_is_contiguous() && + src.span_is_contiguous() && + dst.span() == src.span() && + dst.dimension_0() == src.dimension_0() 
&& + dst.dimension_1() == src.dimension_1() && + dst.dimension_2() == src.dimension_2() && + dst.dimension_3() == src.dimension_3() && + dst.dimension_4() == src.dimension_4() && + dst.dimension_5() == src.dimension_5() && + dst.dimension_6() == src.dimension_6() && + dst.dimension_7() == src.dimension_7() && + dst.stride_0() == src.stride_0() && + dst.stride_1() == src.stride_1() && + dst.stride_2() == src.stride_2() && + dst.stride_3() == src.stride_3() && + dst.stride_4() == src.stride_4() && + dst.stride_5() == src.stride_5() && + dst.stride_6() == src.stride_6() && + dst.stride_7() == src.stride_7() + ) { + + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + + Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes ); + } + else if ( DstExecCanAccessSrc ) { + // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. + Kokkos::Experimental::Impl::DynRankViewRemap< dst_type , src_type >( dst , src ); + } + else if ( SrcExecCanAccessDst ) { + // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. + Kokkos::Experimental::Impl::DynRankViewRemap< dst_type , src_type , src_execution_space >( dst , src ); + } + else { + Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); + } + } } } //end Experimental @@ -900,6 +1619,48 @@ void deep_copy namespace Kokkos { namespace Experimental { +namespace Impl { + + +// Deduce Mirror Types +template<class Space, class T, class ... 
P> +struct MirrorDRViewType { + // The incoming view_type + typedef typename Kokkos::Experimental::DynRankView<T,P...> src_view_type; + // The memory space for the mirror view + typedef typename Space::memory_space memory_space; + // Check whether it is the same memory space + enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value }; + // The array_layout + typedef typename src_view_type::array_layout array_layout; + // The data type (we probably want it non-const since otherwise we can't even deep_copy to it. + typedef typename src_view_type::non_const_data_type data_type; + // The destination view type if it is not the same memory space + typedef Kokkos::Experimental::DynRankView<data_type,array_layout,Space> dest_view_type; + // If it is the same memory_space return the existsing view_type + // This will also keep the unmanaged trait if necessary + typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type; +}; + +template<class Space, class T, class ... P> +struct MirrorDRVType { + // The incoming view_type + typedef typename Kokkos::Experimental::DynRankView<T,P...> src_view_type; + // The memory space for the mirror view + typedef typename Space::memory_space memory_space; + // Check whether it is the same memory space + enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value }; + // The array_layout + typedef typename src_view_type::array_layout array_layout; + // The data type (we probably want it non-const since otherwise we can't even deep_copy to it. + typedef typename src_view_type::non_const_data_type data_type; + // The destination view type if it is not the same memory space + typedef Kokkos::Experimental::DynRankView<data_type,array_layout,Space> view_type; +}; + +} + + template< class T , class ... 
P > inline typename DynRankView<T,P...>::HostMirror @@ -914,14 +1675,7 @@ create_mirror( const DynRankView<T,P...> & src typedef typename src_type::HostMirror dst_type ; return dst_type( std::string( src.label() ).append("_mirror") - , src.dimension(0) - , src.dimension(1) - , src.dimension(2) - , src.dimension(3) - , src.dimension(4) - , src.dimension(5) - , src.dimension(6) - , src.dimension(7) ); + , Impl::reconstructLayout(src.layout(), src.rank()) ); } @@ -938,27 +1692,15 @@ create_mirror( const DynRankView<T,P...> & src typedef DynRankView<T,P...> src_type ; typedef typename src_type::HostMirror dst_type ; - Kokkos::LayoutStride layout ; - - layout.dimension[0] = src.dimension(0); - layout.dimension[1] = src.dimension(1); - layout.dimension[2] = src.dimension(2); - layout.dimension[3] = src.dimension(3); - layout.dimension[4] = src.dimension(4); - layout.dimension[5] = src.dimension(5); - layout.dimension[6] = src.dimension(6); - layout.dimension[7] = src.dimension(7); - - layout.stride[0] = src.stride(0); - layout.stride[1] = src.stride(1); - layout.stride[2] = src.stride(2); - layout.stride[3] = src.stride(3); - layout.stride[4] = src.stride(4); - layout.stride[5] = src.stride(5); - layout.stride[6] = src.stride(6); - layout.stride[7] = src.stride(7); - - return dst_type( std::string( src.label() ).append("_mirror") , layout ); + return dst_type( std::string( src.label() ).append("_mirror") + , Impl::reconstructLayout(src.layout(), src.rank()) ); +} + + +// Create a mirror in a new space (specialization for different space) +template<class Space, class T, class ... P> +typename Impl::MirrorDRVType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::Experimental::DynRankView<T,P...> & src) { + return typename Impl::MirrorDRVType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) ); } template< class T , class ... 
P > @@ -997,6 +1739,22 @@ create_mirror_view( const DynRankView<T,P...> & src return Kokkos::Experimental::create_mirror( src ); } +// Create a mirror view in a new space (specialization for same space) +template<class Space, class T, class ... P> +typename Impl::MirrorDRViewType<Space,T,P ...>::view_type +create_mirror_view(const Space& , const Kokkos::Experimental::DynRankView<T,P...> & src + , typename std::enable_if<Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) { + return src; +} + +// Create a mirror view in a new space (specialization for different space) +template<class Space, class T, class ... P> +typename Impl::MirrorDRViewType<Space,T,P ...>::view_type +create_mirror_view(const Space& , const Kokkos::Experimental::DynRankView<T,P...> & src + , typename std::enable_if<!Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) { + return typename Impl::MirrorDRViewType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) ); +} + } //end Experimental } //end Kokkos @@ -1006,27 +1764,26 @@ create_mirror_view( const DynRankView<T,P...> & src namespace Kokkos { namespace Experimental { - /** \brief Resize a view with copying old data to new data at the corresponding indices. */ template< class T , class ... 
P > inline void resize( DynRankView<T,P...> & v , - const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) + const size_t n0 = ~size_t(0) , + const size_t n1 = ~size_t(0) , + const size_t n2 = ~size_t(0) , + const size_t n3 = ~size_t(0) , + const size_t n4 = ~size_t(0) , + const size_t n5 = ~size_t(0) , + const size_t n6 = ~size_t(0) , + const size_t n7 = ~size_t(0) ) { typedef DynRankView<T,P...> drview_type ; static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only resize managed views" ); - drview_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 ); + drview_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6 ); - Kokkos::Experimental::Impl::ViewRemap< drview_type , drview_type >( v_resized , v ); + Kokkos::Experimental::Impl::DynRankViewRemap< drview_type , drview_type >( v_resized, v ); v = v_resized ; } @@ -1035,29 +1792,30 @@ void resize( DynRankView<T,P...> & v , template< class T , class ... 
P > inline void realloc( DynRankView<T,P...> & v , - const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) + const size_t n0 = ~size_t(0) , + const size_t n1 = ~size_t(0) , + const size_t n2 = ~size_t(0) , + const size_t n3 = ~size_t(0) , + const size_t n4 = ~size_t(0) , + const size_t n5 = ~size_t(0) , + const size_t n6 = ~size_t(0) , + const size_t n7 = ~size_t(0) ) { - typedef DynRankView<T,P...> view_type ; + typedef DynRankView<T,P...> drview_type ; static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" ); const std::string label = v.label(); - v = view_type(); // Deallocate first, if the only view to allocation - v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 ); + v = drview_type(); // Deallocate first, if the only view to allocation + v = drview_type( label, n0, n1, n2, n3, n4, n5, n6 ); } } //end Experimental } //end Kokkos +using Kokkos::Experimental::is_dyn_rank_view ; namespace Kokkos { @@ -1068,6 +1826,7 @@ using Kokkos::Experimental::deep_copy ; using Kokkos::Experimental::create_mirror ; using Kokkos::Experimental::create_mirror_view ; using Kokkos::Experimental::subdynrankview ; +using Kokkos::Experimental::subview ; using Kokkos::Experimental::resize ; using Kokkos::Experimental::realloc ; diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index b1f9e95ed0..fb364f0bf2 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -77,10 +77,7 @@ private: public: - typedef Kokkos::Experimental::MemoryPool - < typename traits::memory_space - , typename traits::execution_space - > memory_pool ; + typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ; private: @@ -338,7 +335,7 @@ public: void operator()( unsigned i ) 
const { if ( m_destroy && i < m_chunk_max && 0 != m_chunks[i] ) { - m_pool.deallocate( m_chunks[i] , m_pool.get_min_chunk_size() ); + m_pool.deallocate( m_chunks[i] , m_pool.get_min_block_size() ); } m_chunks[i] = 0 ; } @@ -397,7 +394,7 @@ public: // The memory pool chunk is guaranteed to be a power of two , m_chunk_shift( Kokkos::Impl::integral_power_of_two( - m_pool.get_min_chunk_size()/sizeof(typename traits::value_type)) ) + m_pool.get_min_block_size()/sizeof(typename traits::value_type)) ) , m_chunk_mask( ( 1 << m_chunk_shift ) - 1 ) , m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift ) { diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp index 7de290e711..df2fbed5a6 100644 --- a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp +++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -45,6 +45,7 @@ #define KOKKOS_BITSET_IMPL_HPP #include <Kokkos_Macros.hpp> +#include <impl/Kokkos_BitOps.hpp> #include <stdint.h> #include <cstdio> @@ -52,122 +53,57 @@ #include <iostream> #include <iomanip> -namespace Kokkos { namespace Impl { +namespace Kokkos { +namespace Impl { KOKKOS_FORCEINLINE_FUNCTION -unsigned rotate_right(unsigned i, int r) +unsigned rotate_right( unsigned i, int r ) { - enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) }; - return r ? ((i >> r) | (i << (size-r))) : i ; + enum { size = static_cast<int>( sizeof(unsigned) * CHAR_BIT ) }; + return r ? ( ( i >> r ) | ( i << ( size - r ) ) ) : i ; } -KOKKOS_FORCEINLINE_FUNCTION -int bit_scan_forward(unsigned i) -{ -#if defined( __CUDA_ARCH__ ) - return __ffs(i) - 1; -#elif defined( __GNUC__ ) || defined( __GNUG__ ) - return __builtin_ffs(i) - 1; -#elif defined( __INTEL_COMPILER ) - return _bit_scan_forward(i); -#else - - unsigned t = 1u; - int r = 0; - while (i && (i & t == 0)) - { - t = t << 1; - ++r; - } - return r; -#endif -} - - -KOKKOS_FORCEINLINE_FUNCTION -int bit_scan_reverse(unsigned i) -{ - enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) }; -#if defined( __CUDA_ARCH__ ) - return shift - __clz(i); -#elif defined( __GNUC__ ) || defined( __GNUG__ ) - return shift - __builtin_clz(i); -#elif defined( __INTEL_COMPILER ) - return _bit_scan_reverse(i); -#else - unsigned t = 1u << shift; - int r = 0; - while (i && (i & t == 0)) - { - t = t >> 1; - ++r; - } - return r; -#endif -} - - -// count the bits set -KOKKOS_FORCEINLINE_FUNCTION -int popcount(unsigned i) -{ -#if defined( __CUDA_ARCH__ ) - return __popc(i); -#elif defined( __GNUC__ ) || defined( __GNUG__ ) - return __builtin_popcount(i); -#elif defined ( __INTEL_COMPILER ) - return _popcnt32(i); -#else - // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive - i = i 
- ((i >> 1) & ~0u/3u); // temp - i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u); // temp - i = (i + (i >> 4)) & ~0u/255u*15u; // temp - return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count -#endif -} - - -template <typename Bitset> +template < typename Bitset > struct BitsetCount { - typedef Bitset bitset_type; - typedef typename bitset_type::execution_space::execution_space execution_space; - typedef typename bitset_type::size_type size_type; - typedef size_type value_type; + typedef Bitset bitset_type; + typedef typename bitset_type::execution_space::execution_space execution_space; + typedef typename bitset_type::size_type size_type; + typedef size_type value_type; bitset_type m_bitset; - BitsetCount( bitset_type const& bitset) + BitsetCount( bitset_type const& bitset ) : m_bitset(bitset) {} size_type apply() const { size_type count = 0u; - parallel_reduce(m_bitset.m_blocks.dimension_0(), *this, count); + parallel_reduce( m_bitset.m_blocks.dimension_0(), *this, count ); return count; } KOKKOS_INLINE_FUNCTION - static void init( value_type & count) + void init( value_type & count ) const { count = 0u; } KOKKOS_INLINE_FUNCTION - static void join( volatile value_type & count, const volatile size_type & incr ) + void join( volatile value_type & count, const volatile size_type & incr ) const { count += incr; } KOKKOS_INLINE_FUNCTION - void operator()( size_type i, value_type & count) const + void operator()( size_type i, value_type & count ) const { - count += popcount(m_bitset.m_blocks[i]); + count += bit_count( m_bitset.m_blocks[i] ); } }; -}} //Kokkos::Impl +} // namespace Impl +} // namespace Kokkos #endif // KOKKOS_BITSET_IMPL_HPP - diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp index 3c4aed7bec..e71ccc0091 100644 --- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -713,13 +713,20 @@ public: typedef 
Kokkos::Experimental::DynRankView< const T , device > const_dView0 ; typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ; - typedef typename dView0::host_mirror_space host ; + typedef typename dView0::host_mirror_space host_drv_space ; + + typedef Kokkos::Experimental::View< T , device > View0 ; + typedef Kokkos::Experimental::View< T* , device > View1 ; + typedef Kokkos::Experimental::View< T******* , device > View7 ; + + typedef typename View0::host_mirror_space host_view_space ; TestDynViewAPI() { + run_test_resize_realloc(); run_test_mirror(); - run_test(); run_test_scalar(); + run_test(); run_test_const(); run_test_subview(); run_test_subview_strided(); @@ -735,19 +742,147 @@ public: TestViewOperator_LeftAndRight< int , device , 1 >::testit(2); } + static void run_test_resize_realloc() + { + dView0 drv0("drv0", 10, 20, 30); + ASSERT_EQ( drv0.rank(), 3); + + Kokkos::Experimental::resize(drv0, 5, 10); + ASSERT_EQ( drv0.rank(), 2); + ASSERT_EQ( drv0.dimension_0(), 5); + ASSERT_EQ( drv0.dimension_1(), 10); + ASSERT_EQ( drv0.dimension_2(), 1); + + Kokkos::Experimental::realloc(drv0, 10, 20); + ASSERT_EQ( drv0.rank(), 2); + ASSERT_EQ( drv0.dimension_0(), 10); + ASSERT_EQ( drv0.dimension_1(), 20); + ASSERT_EQ( drv0.dimension_2(), 1); + + } + static void run_test_mirror() { - typedef Kokkos::Experimental::DynRankView< int , host > view_type ; + typedef Kokkos::Experimental::DynRankView< int , host_drv_space > view_type ; typedef typename view_type::HostMirror mirror_type ; view_type a("a"); mirror_type am = Kokkos::Experimental::create_mirror_view(a); mirror_type ax = Kokkos::Experimental::create_mirror(a); ASSERT_EQ( & a() , & am() ); + ASSERT_EQ( a.rank() , am.rank() ); + ASSERT_EQ( ax.rank() , am.rank() ); + + if (Kokkos::HostSpace::execution_space::is_initialized() ) + { + Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000); + auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h); + 
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h); + + int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0; + int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0; + int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0; + + ASSERT_EQ(equal_ptr_h_h2,0); + ASSERT_EQ(equal_ptr_h_d ,0); + ASSERT_EQ(equal_ptr_h2_d,0); + + ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0()); + ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0()); + + ASSERT_EQ(a_h.rank(),a_h2.rank()); + ASSERT_EQ(a_h.rank(),a_d.rank()); + } + if (Kokkos::HostSpace::execution_space::is_initialized() ) + { + Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000); + auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h); + auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h); + + int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0; + int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0; + int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0; + + ASSERT_EQ(equal_ptr_h_h2,0); + ASSERT_EQ(equal_ptr_h_d ,0); + ASSERT_EQ(equal_ptr_h2_d,0); + + ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0()); + ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0()); + + ASSERT_EQ(a_h.rank(),a_h2.rank()); + ASSERT_EQ(a_h.rank(),a_d.rank()); + } + + if (Kokkos::HostSpace::execution_space::is_initialized() ) + { + Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000); + auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h); + auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h); + + int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0; + int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0; + int equal_ptr_h2_d = a_h2.data()==a_d. 
data()?1:0; + + int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0; + ASSERT_EQ(equal_ptr_h_h2,1); + ASSERT_EQ(equal_ptr_h_d ,is_same_memspace); + ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace); + + ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0()); + ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0()); + + ASSERT_EQ(a_h.rank(),a_h2.rank()); + ASSERT_EQ(a_h.rank(),a_d.rank()); + } + if (Kokkos::HostSpace::execution_space::is_initialized() ) + { + Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000); + auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h); + auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h); + + int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0; + int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0; + int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0; + + int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0; + ASSERT_EQ(equal_ptr_h_h2,1); + ASSERT_EQ(equal_ptr_h_d ,is_same_memspace); + ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace); + + ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0()); + ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0()); + + ASSERT_EQ(a_h.rank(),a_h2.rank()); + ASSERT_EQ(a_h.rank(),a_d.rank()); + } + if (Kokkos::HostSpace::execution_space::is_initialized() ) + { + typedef Kokkos::DynRankView< int , Kokkos::LayoutStride , Kokkos::HostSpace > view_stride_type ; + unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent + view_stride_type a_h( "a" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) ); + auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h); + auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h); + + int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0; + int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0; + int equal_ptr_h2_d = a_h2.data()==a_d. 
data()?1:0; + + int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0; + ASSERT_EQ(equal_ptr_h_h2,1); + ASSERT_EQ(equal_ptr_h_d ,is_same_memspace); + ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace); + + ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0()); + ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0()); + + ASSERT_EQ(a_h.rank(),a_h2.rank()); + ASSERT_EQ(a_h.rank(),a_d.rank()); + } } static void run_test_scalar() { - typedef typename dView0::HostMirror hView0 ; + typedef typename dView0::HostMirror hView0 ; //HostMirror of DynRankView is a DynRankView dView0 dx , dy ; hView0 hx , hy ; @@ -765,6 +900,79 @@ public: Kokkos::Experimental::deep_copy( hy , dy ); ASSERT_EQ( hx(), hy() ); + ASSERT_EQ( dx.rank() , hx.rank() ); + ASSERT_EQ( dy.rank() , hy.rank() ); + + //View - DynRankView Interoperability tests + // deep_copy DynRankView to View + View0 vx("vx"); + Kokkos::deep_copy( vx , dx ); + ASSERT_EQ( rank(dx) , rank(vx) ); + + View0 vy("vy"); + Kokkos::deep_copy( vy , dy ); + ASSERT_EQ( rank(dy) , rank(vy) ); + + // deep_copy View to DynRankView + dView0 dxx("dxx"); + Kokkos::deep_copy( dxx , vx ); + ASSERT_EQ( rank(dxx) , rank(vx) ); + + + View7 vcast = dx.ConstDownCast(); + ASSERT_EQ( dx.dimension_0() , vcast.dimension_0() ); + ASSERT_EQ( dx.dimension_1() , vcast.dimension_1() ); + ASSERT_EQ( dx.dimension_2() , vcast.dimension_2() ); + ASSERT_EQ( dx.dimension_3() , vcast.dimension_3() ); + ASSERT_EQ( dx.dimension_4() , vcast.dimension_4() ); + + View7 vcast1( dy.ConstDownCast() ); + ASSERT_EQ( dy.dimension_0() , vcast1.dimension_0() ); + ASSERT_EQ( dy.dimension_1() , vcast1.dimension_1() ); + ASSERT_EQ( dy.dimension_2() , vcast1.dimension_2() ); + ASSERT_EQ( dy.dimension_3() , vcast1.dimension_3() ); + ASSERT_EQ( dy.dimension_4() , vcast1.dimension_4() ); + + //View - DynRankView Interoperability tests + // copy View to DynRankView + dView0 dfromvx( vx ); + auto hmx = Kokkos::create_mirror_view(dfromvx) ; + Kokkos::deep_copy(hmx , 
dfromvx); + auto hvx = Kokkos::create_mirror_view(vx) ; + Kokkos::deep_copy(hvx , vx); + ASSERT_EQ( rank(hvx) , rank(hmx) ); + ASSERT_EQ( hvx.dimension_0() , hmx.dimension_0() ); + ASSERT_EQ( hvx.dimension_1() , hmx.dimension_1() ); + + // copy-assign View to DynRankView + dView0 dfromvy = vy ; + auto hmy = Kokkos::create_mirror_view(dfromvy) ; + Kokkos::deep_copy(hmy , dfromvy); + auto hvy = Kokkos::create_mirror_view(vy) ; + Kokkos::deep_copy(hvy , vy); + ASSERT_EQ( rank(hvy) , rank(hmy) ); + ASSERT_EQ( hvy.dimension_0() , hmy.dimension_0() ); + ASSERT_EQ( hvy.dimension_1() , hmy.dimension_1() ); + + + View7 vtest1("vtest1",2,2,2,2,2,2,2); + dView0 dfromv1( vtest1 ); + ASSERT_EQ( dfromv1.rank() , vtest1.Rank ); + ASSERT_EQ( dfromv1.dimension_0() , vtest1.dimension_0() ); + ASSERT_EQ( dfromv1.dimension_1() , vtest1.dimension_1() ); + ASSERT_EQ( dfromv1.use_count() , vtest1.use_count() ); + + dView0 dfromv2( vcast ); + ASSERT_EQ( dfromv2.rank() , vcast.Rank ); + ASSERT_EQ( dfromv2.dimension_0() , vcast.dimension_0() ); + ASSERT_EQ( dfromv2.dimension_1() , vcast.dimension_1() ); + ASSERT_EQ( dfromv2.use_count() , vcast.use_count() ); + + dView0 dfromv3 = vcast1; + ASSERT_EQ( dfromv3.rank() , vcast1.Rank ); + ASSERT_EQ( dfromv3.dimension_0() , vcast1.dimension_0() ); + ASSERT_EQ( dfromv3.dimension_1() , vcast1.dimension_1() ); + ASSERT_EQ( dfromv3.use_count() , vcast1.use_count() ); } static void run_test() @@ -782,22 +990,32 @@ public: (void) thing; } + dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"),10,20); + ASSERT_TRUE( d_uninitialized.data() != nullptr ); + ASSERT_EQ( d_uninitialized.rank() , 2 ); + ASSERT_EQ( d_uninitialized.dimension_0() , 10 ); + ASSERT_EQ( d_uninitialized.dimension_1() , 20 ); + ASSERT_EQ( d_uninitialized.dimension_2() , 1 ); + dView0 dx , dy , dz ; hView0 hx , hy , hz ; - ASSERT_TRUE( dx.ptr_on_device() == 0 ); - ASSERT_TRUE( dy.ptr_on_device() == 0 ); - ASSERT_TRUE( dz.ptr_on_device() == 0 ); + ASSERT_TRUE( 
Kokkos::Experimental::is_dyn_rank_view<dView0>::value ); + ASSERT_FALSE( Kokkos::Experimental::is_dyn_rank_view< Kokkos::View<double> >::value ); + + ASSERT_TRUE( dx.ptr_on_device() == 0 ); //Okay with UVM + ASSERT_TRUE( dy.ptr_on_device() == 0 ); //Okay with UVM + ASSERT_TRUE( dz.ptr_on_device() == 0 ); //Okay with UVM ASSERT_TRUE( hx.ptr_on_device() == 0 ); ASSERT_TRUE( hy.ptr_on_device() == 0 ); ASSERT_TRUE( hz.ptr_on_device() == 0 ); - ASSERT_EQ( dx.dimension_0() , 0u ); - ASSERT_EQ( dy.dimension_0() , 0u ); - ASSERT_EQ( dz.dimension_0() , 0u ); + ASSERT_EQ( dx.dimension_0() , 0u ); //Okay with UVM + ASSERT_EQ( dy.dimension_0() , 0u ); //Okay with UVM + ASSERT_EQ( dz.dimension_0() , 0u ); //Okay with UVM ASSERT_EQ( hx.dimension_0() , 0u ); ASSERT_EQ( hy.dimension_0() , 0u ); ASSERT_EQ( hz.dimension_0() , 0u ); - ASSERT_EQ( dx.rank() , 0u ); + ASSERT_EQ( dx.rank() , 0u ); //Okay with UVM ASSERT_EQ( hx.rank() , 0u ); dx = dView0( "dx" , N1 , N2 , N3 ); @@ -806,11 +1024,11 @@ public: hx = hView0( "hx" , N1 , N2 , N3 ); hy = hView0( "hy" , N1 , N2 , N3 ); - ASSERT_EQ( dx.dimension_0() , unsigned(N1) ); - ASSERT_EQ( dy.dimension_0() , unsigned(N1) ); + ASSERT_EQ( dx.dimension_0() , unsigned(N1) ); //Okay with UVM + ASSERT_EQ( dy.dimension_0() , unsigned(N1) ); //Okay with UVM ASSERT_EQ( hx.dimension_0() , unsigned(N1) ); ASSERT_EQ( hy.dimension_0() , unsigned(N1) ); - ASSERT_EQ( dx.rank() , 3 ); + ASSERT_EQ( dx.rank() , 3 ); //Okay with UVM ASSERT_EQ( hx.rank() , 3 ); dx = dView0( "dx" , N0 , N1 , N2 , N3 ); @@ -823,19 +1041,23 @@ public: ASSERT_EQ( hx.dimension_0() , unsigned(N0) ); ASSERT_EQ( hy.dimension_0() , unsigned(N0) ); ASSERT_EQ( dx.rank() , 4 ); + ASSERT_EQ( dy.rank() , 4 ); ASSERT_EQ( hx.rank() , 4 ); + ASSERT_EQ( hy.rank() , 4 ); ASSERT_EQ( dx.use_count() , size_t(1) ); dView0_unmanaged unmanaged_dx = dx; ASSERT_EQ( dx.use_count() , size_t(1) ); + dView0_unmanaged unmanaged_from_ptr_dx = dView0_unmanaged(dx.ptr_on_device(), dx.dimension_0(), 
dx.dimension_1(), dx.dimension_2(), dx.dimension_3()); + { // Destruction of this view should be harmless const_dView0 unmanaged_from_ptr_const_dx( dx.ptr_on_device() , @@ -888,6 +1110,19 @@ public: hx = Kokkos::Experimental::create_mirror( dx ); hy = Kokkos::Experimental::create_mirror( dy ); + ASSERT_EQ( hx.rank() , dx.rank() ); + ASSERT_EQ( hy.rank() , dy.rank() ); + + ASSERT_EQ( hx.dimension_0() , unsigned(N0) ); + ASSERT_EQ( hx.dimension_1() , unsigned(N1) ); + ASSERT_EQ( hx.dimension_2() , unsigned(N2) ); + ASSERT_EQ( hx.dimension_3() , unsigned(N3) ); + + ASSERT_EQ( hy.dimension_0() , unsigned(N0) ); + ASSERT_EQ( hy.dimension_1() , unsigned(N1) ); + ASSERT_EQ( hy.dimension_2() , unsigned(N2) ); + ASSERT_EQ( hy.dimension_3() , unsigned(N3) ); + // T v1 = hx() ; // Generates compile error as intended // T v2 = hx(0,0) ; // Generates compile error as intended // hx(0,0) = v2 ; // Generates compile error as intended @@ -990,7 +1225,9 @@ public: for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); } }}}} +// ASSERT_EQ( hx(0,0,0,0,0,0,0,0) , T(0) ); //Test rank8 op behaves properly - if implemented } + dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz); dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz); @@ -1006,6 +1243,35 @@ public: ASSERT_TRUE( dx.ptr_on_device() == 0 ); ASSERT_TRUE( dy.ptr_on_device() == 0 ); ASSERT_TRUE( dz.ptr_on_device() == 0 ); + + //View - DynRankView Interoperability tests + // deep_copy from view to dynrankview + const int testdim = 4; + dView0 dxx("dxx",testdim); + View1 vxx("vxx",testdim); + auto hvxx = Kokkos::create_mirror_view(vxx); + for (int i = 0; i < testdim; ++i) + { hvxx(i) = i; } + Kokkos::deep_copy(vxx,hvxx); + Kokkos::deep_copy(dxx,vxx); + auto hdxx = Kokkos::create_mirror_view(dxx); + Kokkos::deep_copy(hdxx,dxx); + for (int i = 0; i < testdim; ++i) + { ASSERT_EQ( hvxx(i) , hdxx(i) ); } + + ASSERT_EQ( rank(hdxx) , rank(hvxx) ); + ASSERT_EQ( hdxx.dimension_0() , testdim ); + ASSERT_EQ( 
hdxx.dimension_0() , hvxx.dimension_0() ); + + // deep_copy from dynrankview to view + View1 vdxx("vdxx",testdim); + auto hvdxx = Kokkos::create_mirror_view(vdxx); + Kokkos::deep_copy(hvdxx , hdxx); + ASSERT_EQ( rank(hdxx) , rank(hvdxx) ); + ASSERT_EQ( hvdxx.dimension_0() , testdim ); + ASSERT_EQ( hdxx.dimension_0() , hvdxx.dimension_0() ); + for (int i = 0; i < testdim; ++i) + { ASSERT_EQ( hvxx(i) , hvdxx(i) ); } } typedef T DataType ; @@ -1059,35 +1325,66 @@ public: // N0 = 1000,N1 = 3,N2 = 5,N3 = 7 unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent sdView d7( "d7" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) ); + ASSERT_EQ( d7.rank() , 7 ); - sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); //Should be rank0 subview + sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); + ASSERT_EQ( ds0.rank() , 0 ); //Basic test - ALL - sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); //compiles and runs + sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); + ASSERT_EQ( dsALL.rank() , 7 ); -// Send a single value for one rank +// Send a value to final rank returning rank 6 subview sdView dsm1 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 1 ); + ASSERT_EQ( dsm1.rank() , 6 ); -// Send a std::pair as a rank +// Send a std::pair as argument to a rank sdView dssp = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , std::pair<unsigned,unsigned>(1,2) ); + ASSERT_EQ( dssp.rank() , 7 ); -// Send a kokkos::pair as a rank; take default layout as input +// Send a 
kokkos::pair as argument to a rank; take default layout as input dView0 dd0("dd0" , N0 , N1 , N2 , 2 , 2 , 2 , 2 ); //default layout + ASSERT_EQ( dd0.rank() , 7 ); sdView dtkp = Kokkos::Experimental::subdynrankview( dd0 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) ); + ASSERT_EQ( dtkp.rank() , 7 ); // Return rank 7 subview, taking a pair as one argument, layout stride input sdView ds7 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) ); + ASSERT_EQ( ds7.rank() , 7 ); // Default Layout DynRankView dView dv6("dv6" , N0 , N1 , N2 , N3 , 2 , 2 ); + ASSERT_EQ( dv6.rank() , 6 ); // DynRankView with LayoutRight typedef Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , device > drView ; drView dr5( "dr5" , N0 , N1 , N2 , 2 , 2 ); + ASSERT_EQ( dr5.rank() , 5 ); // LayoutStride but arranged as LayoutRight - unsigned order3[] = { 4,3,2,1,0 }, dimen3[] = { N0, N1, N2, 2, 2 }; - sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order3, dimen3) ); + // NOTE: unused arg_layout dimensions must be set to ~size_t(0) so that + // rank deduction can properly take place + unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 }; + Kokkos::LayoutStride ls = Kokkos::LayoutStride::order_dimensions(5, order5, dimen5); + ls.dimension[5] = ~size_t(0); + ls.dimension[6] = ~size_t(0); + ls.dimension[7] = ~size_t(0); + sdView d5("d5", ls); + ASSERT_EQ( d5.rank() , 5 ); + +// LayoutStride arranged as LayoutRight - commented out as example that fails unit test +// unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 }; +// sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order5, dimen5) ); +// +// Fails the following unit test: +// ASSERT_EQ( d5.rank() , dr5.rank() ); +// +// Explanation: In construction of the 
Kokkos::LayoutStride below, since the +// remaining dimensions are not specified, they will default to values of 0 +// rather than ~size_t(0). +// When passed to the DynRankView constructor the default dimensions (of 0) +// will be counted toward the dynamic rank and returning an incorrect value +// (i.e. rank 7 rather than 5). // Check LayoutRight dr5 and LayoutStride d5 dimensions agree (as they should) ASSERT_EQ( d5.dimension_0() , dr5.dimension_0() ); @@ -1100,21 +1397,21 @@ public: // Rank 5 subview of rank 5 dynamic rank view, layout stride input sdView ds5 = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) ); + ASSERT_EQ( ds5.rank() , 5 ); // Pass in extra ALL arguments beyond the rank of the DynRank View. // This behavior is allowed - ignore the extra ALL arguments when // the src.rank() < number of arguments, but be careful! sdView ds5plus = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) , Kokkos::ALL() ); + ASSERT_EQ( ds5.rank() , ds5plus.rank() ); ASSERT_EQ( ds5.dimension_0() , ds5plus.dimension_0() ); ASSERT_EQ( ds5.dimension_4() , ds5plus.dimension_4() ); ASSERT_EQ( ds5.dimension_5() , ds5plus.dimension_5() ); - ASSERT_EQ( ds5.rank() , ds5plus.rank() ); - ASSERT_EQ( ds5.rank() , 5 ); #if ! defined( KOKKOS_HAVE_CUDA ) || defined ( KOKKOS_USE_CUDA_UVM ) - ASSERT_EQ( & ds5(1,1,1,1) - & ds5plus(1,1,1,1) , 0 ); ASSERT_EQ( & ds5(1,1,1,1,0) - & ds5plus(1,1,1,1,0) , 0 ); + ASSERT_EQ( & ds5(1,1,1,1,0,0) - & ds5plus(1,1,1,1,0,0) , 0 ); // passing argument to rank beyond the view's rank is allowed iff it is a 0. 
#endif // Similar test to rank 5 above, but create rank 4 subview @@ -1131,9 +1428,9 @@ public: static void run_test_subview_strided() { - typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host > drview_left ; - typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host > drview_right ; - typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host > drview_stride ; + typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host_drv_space > drview_left ; + typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host_drv_space > drview_right ; + typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host_drv_space > drview_stride ; drview_left xl2( "xl2", 100 , 200 ); drview_right xr2( "xr2", 100 , 200 ); @@ -1159,35 +1456,37 @@ public: drview_left xl4( "xl4", 10 , 20 , 30 , 40 ); drview_right xr4( "xr4", 10 , 20 , 30 , 40 ); - drview_stride yl4 = Kokkos::Experimental::subdynrankview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() ); - drview_stride yr4 = Kokkos::Experimental::subdynrankview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() ); + //Replace subdynrankview with subview - test + drview_stride yl4 = Kokkos::Experimental::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() ); + drview_stride yr4 = Kokkos::Experimental::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() ); ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() ); ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() ); ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() ); ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() ); + ASSERT_EQ( yl4.rank() , 2); + ASSERT_EQ( yr4.rank() , 2); ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 ); ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 ); - } static void run_test_vector() { static const unsigned Length = 1000 , Count = 8 ; - typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host > multivector_type ; + typedef typename 
Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host_drv_space > multivector_type ; - typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host > multivector_right_type ; + typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host_drv_space > multivector_right_type ; multivector_type mv = multivector_type( "mv" , Length , Count ); multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count ); - typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > svector_type ; - typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > smultivector_type ; - typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_right_type ; //LayoutStride, not right; setup to match original ViewAPI calls... update - typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_type ; - typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_smultivector_type ; + typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > svector_type ; + typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > smultivector_type ; + typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_right_type ; + typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_type ; + typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_smultivector_type ; svector_type v1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 0 ); svector_type v2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 1 ); @@ -1251,7 +1550,6 @@ public: const_smultivector_type cmv( mv ); typename 
smultivector_type::const_type cmvX( cmv ); typename const_smultivector_type::const_type ccmvX( cmv ); - } }; diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index fd37f16f0a..7e3ca005f4 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -61,8 +61,7 @@ struct TestDynamicView typedef typename Space::execution_space execution_space ; typedef typename Space::memory_space memory_space ; - typedef Kokkos::Experimental::MemoryPool< memory_space , execution_space > - memory_pool_type ; + typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type; typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type; @@ -129,11 +128,9 @@ struct TestDynamicView typedef Kokkos::TeamPolicy<execution_space,TEST> TestPolicy ; typedef Kokkos::TeamPolicy<execution_space,VERIFY> VerifyPolicy ; - const unsigned int chunk_size = 1024 ; - // printf("TestDynamicView::run(%d) construct memory pool\n",arg_total_size); - memory_pool_type pool( memory_space() , chunk_size , arg_total_size * sizeof(Scalar) ); + memory_pool_type pool( memory_space() , arg_total_size * sizeof(Scalar) * 1.2 ); // printf("TestDynamicView::run(%d) construct dynamic view\n",arg_total_size); diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in index 961ad58ec5..27e3ba1c31 100644 --- a/lib/kokkos/core/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/core/cmake/KokkosCore_config.h.in @@ -34,6 +34,7 @@ #cmakedefine KOKKOS_HAVE_Winthread #cmakedefine KOKKOS_HAVE_OPENMP #cmakedefine KOKKOS_HAVE_HWLOC +#cmakedefine KOKKOS_HAVE_DEBUG #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK #cmakedefine KOKKOS_HAVE_CXX11 #cmakedefine KOKKOS_HAVE_CUSPARSE diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index 34aa81e92c..d93ca14d96 100644 --- 
a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -8,11 +8,22 @@ SET(SOURCES PerfTestCuda.cpp ) +# Per #374, we always want to build this test, but we only want to run +# it as a PERFORMANCE test. That's why we separate building the test +# from running the test. + +TRIBITS_ADD_EXECUTABLE( + PerfTestExec + SOURCES ${SOURCES} + COMM serial mpi + TESTONLYLIBS kokkos_gtest + ) + TRIBITS_ADD_EXECUTABLE_AND_TEST( PerfTest - SOURCES ${SOURCES} + NAME PerfTestExec COMM serial mpi NUM_MPI_PROCS 1 + CATEGORIES PERFORMANCE FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest ) diff --git a/lib/kokkos/core/perf_test/PerfTestCuda.cpp b/lib/kokkos/core/perf_test/PerfTestCuda.cpp index 4a4bc13cd4..524beb8b90 100644 --- a/lib/kokkos/core/perf_test/PerfTestCuda.cpp +++ b/lib/kokkos/core/perf_test/PerfTestCuda.cpp @@ -159,7 +159,7 @@ struct TextureFetch Kokkos::Cuda::fence(); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; for (int j=0; j<10; ++j) { RandomReduce f(array,indexes); f.apply(reduce); diff --git a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp index 03805dcdf5..516696b141 100644 --- a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp +++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp @@ -153,7 +153,7 @@ struct ModifiedGramSchmidt Kokkos::deep_copy( one , (Scalar) 1 ); - Kokkos::Impl::Timer timer ; + Kokkos::Timer timer ; for ( size_type j = 0 ; j < count ; ++j ) { // Reduction : tmp = dot( Q(:,j) , Q(:,j) ); diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp index d13d9a49e8..ed5371f29c 100644 --- a/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp +++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp @@ -252,7 +252,7 @@ struct HexGrad execution_space::fence(); for ( int i = 0 ; i < iter ; ++i ) { - Kokkos::Impl::Timer timer ; + Kokkos::Timer timer ; Kokkos::parallel_for( count , HexGrad<execution_space>( 
coord , grad ) ); execution_space::fence(); const double dt = timer.seconds(); diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp index 882a5c615e..ab73f2505e 100644 --- a/lib/kokkos/core/perf_test/test_atomic.cpp +++ b/lib/kokkos/core/perf_test/test_atomic.cpp @@ -414,24 +414,27 @@ void Loop(int loop, int test, const char* type_name) { Kokkos::Impl::Timer timer; T res = LoopVariant<T>(loop,test); - double time1 = timer.seconds(); + double time = timer.seconds(); timer.reset(); T resNonAtomic = LoopVariantNonAtomic<T>(loop,test); - double time2 = timer.seconds(); + double timeNonAtomic = timer.seconds(); timer.reset(); T resSerial = LoopVariantSerial<T>(loop,test); - double time3 = timer.seconds(); + double timeSerial = timer.seconds(); - time1*=1e6/loop; - time2*=1e6/loop; - time3*=1e6/loop; + time *=1e6/loop; + timeNonAtomic*=1e6/loop; + timeSerial *=1e6/loop; //textcolor_standard(); bool passed = true; if(resSerial!=res) passed = false; //if(!passed) textcolor(RESET,BLACK,YELLOW); - printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T)); + printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)", + type_name,test,passed?"PASSED":"FAILED",loop, + 1.0*resSerial,1.0*res,1.0*resNonAtomic, + timeSerial,time,timeNonAtomic,(int)sizeof(T)); //if(!passed) textcolor_standard(); printf("\n"); } @@ -452,7 +455,7 @@ void Test(int loop, int test, const char* type_name) { int main(int argc, char* argv[]) { int type = -1; - int loop = 1000000; + int loop = 100000; int test = -1; for(int i=0;i<argc;i++) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp index 9930cdf1ba..d1a560ee04 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp +++ 
b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp @@ -124,15 +124,31 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits: #endif + +namespace Kokkos { +namespace Impl { + struct CudaLockArraysStruct { + int* atomic; + int* scratch; + int* threadid; + }; +} +} __device__ __constant__ #ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE extern #endif -int* kokkos_impl_cuda_atomic_lock_array ; +Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ; #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF #define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39 +namespace Kokkos { +namespace Impl { + void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false); +} +} + namespace Kokkos { namespace Impl { __device__ inline @@ -140,8 +156,7 @@ bool lock_address_cuda_space(void* ptr) { size_t offset = size_t(ptr); offset = offset >> 2; offset = offset & CUDA_SPACE_ATOMIC_MASK; - //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK; - return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1)); + return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1)); } __device__ inline @@ -149,8 +164,7 @@ void unlock_address_cuda_space(void* ptr) { size_t offset = size_t(ptr); offset = offset >> 2; offset = offset & CUDA_SPACE_ATOMIC_MASK; - //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK; - atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0); + atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0); } } @@ -232,8 +246,11 @@ struct CudaParallelLaunch< DriverType , true > { cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) ); #ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE - int* lock_array_ptr = lock_array_cuda_space_ptr(); - cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) ); + Kokkos::Impl::CudaLockArraysStruct locks; + locks.atomic = atomic_lock_array_cuda_space_ptr(false); + locks.scratch = scratch_lock_array_cuda_space_ptr(false); + 
locks.threadid = threadid_lock_array_cuda_space_ptr(false); + cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ); #endif // Invoke the driver function on the device @@ -271,8 +288,11 @@ struct CudaParallelLaunch< DriverType , false > { #endif #ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE - int* lock_array_ptr = lock_array_cuda_space_ptr(); - cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) ); + Kokkos::Impl::CudaLockArraysStruct locks; + locks.atomic = atomic_lock_array_cuda_space_ptr(false); + locks.scratch = scratch_lock_array_cuda_space_ptr(false); + locks.threadid = threadid_lock_array_cuda_space_ptr(false); + cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ); #endif cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver ); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 863488c3b0..a4f372d65d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -51,10 +51,10 @@ /* only compile this file if CUDA is enabled for Kokkos */ #ifdef KOKKOS_HAVE_CUDA +#include <Kokkos_Core.hpp> #include <Kokkos_Cuda.hpp> #include <Kokkos_CudaSpace.hpp> -#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp> #include <Cuda/Kokkos_Cuda_Internal.hpp> #include <impl/Kokkos_Error.hpp> @@ -107,68 +107,6 @@ void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) { namespace Kokkos { -#if ! 
KOKKOS_USING_EXP_VIEW - -namespace { - -void texture_object_attach_impl( Impl::AllocationTracker const & tracker - , unsigned type_size - , ::cudaChannelFormatDesc const & desc - ) -{ - enum { TEXTURE_BOUND_1D = 2u << 27 }; - - if ( tracker.attribute() == NULL ) { - // check for correct allocator - const bool ok_alloc = tracker.allocator()->support_texture_binding(); - - const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D; - - if (ok_alloc && ok_count) { - Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc ); - tracker.set_attribute( attr ); - } - else { - std::ostringstream oss; - oss << "Error: Cannot attach texture object"; - if (!ok_alloc) { - oss << ", incompatabile allocator " << tracker.allocator()->name(); - } - if (!ok_count) { - oss << ", array " << tracker.label() << " too large"; - } - oss << "."; - Kokkos::Impl::throw_runtime_exception( oss.str() ); - } - } - - if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) { - std::ostringstream oss; - oss << "Error: Allocation " << tracker.label() << " already has an attribute attached."; - Kokkos::Impl::throw_runtime_exception( oss.str() ); - } - -} - -} // unnamed namespace - -/*--------------------------------------------------------------------------*/ - -Impl::AllocationTracker CudaSpace::allocate_and_track( const std::string & label, const size_t size ) -{ - return Impl::AllocationTracker( allocator(), size, label); -} - -void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker - , unsigned type_size - , ::cudaChannelFormatDesc const & desc - ) -{ - texture_object_attach_impl( tracker, type_size, desc ); -} - -#endif /* #if ! 
KOKKOS_USING_EXP_VIEW */ - void CudaSpace::access_error() { const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" ); @@ -183,23 +121,6 @@ void CudaSpace::access_error( const void * const ) /*--------------------------------------------------------------------------*/ -#if ! KOKKOS_USING_EXP_VIEW - -Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size ) -{ - return Impl::AllocationTracker( allocator(), size, label); -} - -void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & tracker - , unsigned type_size - , ::cudaChannelFormatDesc const & desc - ) -{ - texture_object_attach_impl( tracker, type_size, desc ); -} - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - bool CudaUVMSpace::available() { #if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__) @@ -212,15 +133,6 @@ bool CudaUVMSpace::available() /*--------------------------------------------------------------------------*/ -#if ! KOKKOS_USING_EXP_VIEW - -Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size ) -{ - return Impl::AllocationTracker( allocator(), size, label); -} - -#endif /* #if ! 
KOKKOS_USING_EXP_VIEW */ - } // namespace Kokkos /*--------------------------------------------------------------------------*/ @@ -824,16 +736,26 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo namespace Kokkos { namespace { - __global__ void init_lock_array_kernel() { + __global__ void init_lock_array_kernel_atomic() { unsigned i = blockIdx.x*blockDim.x + threadIdx.x; if(i<CUDA_SPACE_ATOMIC_MASK+1) - kokkos_impl_cuda_atomic_lock_array[i] = 0; + kokkos_impl_cuda_lock_arrays.atomic[i] = 0; + } + + __global__ void init_lock_array_kernel_scratch_threadid(int N) { + unsigned i = blockIdx.x*blockDim.x + threadIdx.x; + + if(i<N) { + kokkos_impl_cuda_lock_arrays.scratch[i] = 0; + kokkos_impl_cuda_lock_arrays.threadid[i] = 0; + } } } + namespace Impl { -int* lock_array_cuda_space_ptr(bool deallocate) { +int* atomic_lock_array_cuda_space_ptr(bool deallocate) { static int* ptr = NULL; if(deallocate) { cudaFree(ptr); @@ -845,15 +767,62 @@ int* lock_array_cuda_space_ptr(bool deallocate) { return ptr; } -void init_lock_array_cuda_space() { - int is_initialized = 0; +int* scratch_lock_array_cuda_space_ptr(bool deallocate) { + static int* ptr = NULL; + if(deallocate) { + cudaFree(ptr); + ptr = NULL; + } + + if(ptr==NULL && !deallocate) + cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency())); + return ptr; +} + +int* threadid_lock_array_cuda_space_ptr(bool deallocate) { + static int* ptr = NULL; + if(deallocate) { + cudaFree(ptr); + ptr = NULL; + } + + if(ptr==NULL && !deallocate) + cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency())); + return ptr; +} + +void init_lock_arrays_cuda_space() { + static int is_initialized = 0; if(! 
is_initialized) { - int* lock_array_ptr = lock_array_cuda_space_ptr(); - cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) ); - init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>(); + Kokkos::Impl::CudaLockArraysStruct locks; + locks.atomic = atomic_lock_array_cuda_space_ptr(false); + locks.scratch = scratch_lock_array_cuda_space_ptr(false); + locks.threadid = threadid_lock_array_cuda_space_ptr(false); + cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ); + init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>(); + init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency()); } } +void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) { + static void* ptr = NULL; + static size_t current_size = 0; + if(current_size == 0) { + current_size = bytes; + ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size); + } + if(bytes > current_size) { + current_size = bytes; + ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size); + } + if((bytes < current_size) && (force_shrink)) { + current_size = bytes; + Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr); + ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size); + } + return ptr; +} + } } #endif // KOKKOS_HAVE_CUDA diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp index 5746176274..10999ee57b 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp @@ -50,7 +50,6 @@ #ifdef KOKKOS_HAVE_CUDA #include <impl/Kokkos_Traits.hpp> -#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase namespace Kokkos { namespace Impl { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp deleted file 
mode 100644 index 05c73121bc..0000000000 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include <Kokkos_Macros.hpp> - -#if ! KOKKOS_USING_EXP_VIEW - -/* only compile this file if CUDA is enabled for Kokkos */ -#ifdef KOKKOS_HAVE_CUDA - -#include <impl/Kokkos_Error.hpp> -#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp> -#include <Cuda/Kokkos_Cuda_Error.hpp> - -#include <sstream> - -namespace Kokkos { namespace Impl { - - -/*--------------------------------------------------------------------------*/ - -TextureAttribute::TextureAttribute( void * const alloc_ptr - , size_t alloc_size - , cudaChannelFormatDesc const & desc - ) - : m_tex_obj(0) -{ - cuda_device_synchronize(); - - struct cudaResourceDesc resDesc ; - struct cudaTextureDesc texDesc ; - - memset( & resDesc , 0 , sizeof(resDesc) ); - memset( & texDesc , 0 , sizeof(texDesc) ); - - resDesc.resType = cudaResourceTypeLinear ; - resDesc.res.linear.desc = desc ; - resDesc.res.linear.sizeInBytes = alloc_size ; - resDesc.res.linear.devPtr = alloc_ptr ; - - CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) ); - - cuda_device_synchronize(); -} - - -TextureAttribute::~TextureAttribute() -{ - if (m_tex_obj) { - cudaDestroyTextureObject( m_tex_obj ); - } -} - -/*--------------------------------------------------------------------------*/ - -void * CudaMallocAllocator::allocate( size_t size ) -{ - void * ptr = NULL; - - CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) ); - - return ptr; -} - -void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ ) -{ - try { - CUDA_SAFE_CALL( cudaFree( ptr ) ); - } catch(...) {} -} - -void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size) -{ - void * ptr = old_ptr; - if (old_size != new_size) { - ptr = allocate( new_size ); - size_t copy_size = old_size < new_size ? 
old_size : new_size; - - CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) ); - - deallocate( old_ptr, old_size ); - } - return ptr; -} - -/*--------------------------------------------------------------------------*/ - -void * CudaUVMAllocator::allocate( size_t size ) -{ -#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) - void * ptr = NULL; - CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) ); - return ptr; -#else - throw_runtime_exception( "CUDA VERSION does not support UVM" ); - return NULL; -#endif -} - -void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ ) -{ - try { - CUDA_SAFE_CALL( cudaFree( ptr ) ); - } catch(...) {} -} - -void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size) -{ - void * ptr = old_ptr; - if (old_size != new_size) { - ptr = allocate( new_size ); - size_t copy_size = old_size < new_size ? old_size : new_size; - - CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) ); - - deallocate( old_ptr, old_size ); - } - return ptr; -} - -/*--------------------------------------------------------------------------*/ - -void * CudaHostAllocator::allocate( size_t size ) -{ - void * ptr = NULL; - CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) ); - return ptr; -} - -void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ ) -{ - try { - CUDA_SAFE_CALL( cudaFreeHost( ptr ) ); - } catch(...) {} -} - -void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size) -{ - void * ptr = old_ptr; - if (old_size != new_size) { - ptr = allocate( new_size ); - size_t copy_size = old_size < new_size ? 
old_size : new_size; - - CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) ); - - deallocate( old_ptr, old_size ); - } - return ptr; -} - -/*--------------------------------------------------------------------------*/ - -}} // namespace Kokkos::Impl - -#endif //KOKKOS_HAVE_CUDA - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp deleted file mode 100644 index 80bc986ad7..0000000000 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp +++ /dev/null @@ -1,190 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP -#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP - -#include <Kokkos_Macros.hpp> - -#if ! KOKKOS_USING_EXP_VIEW - -/* only compile this file if CUDA is enabled for Kokkos */ -#ifdef KOKKOS_HAVE_CUDA - -#include <impl/Kokkos_Traits.hpp> -#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase - -namespace Kokkos { namespace Impl { - - -// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t' -// to be an 'unsigned long long'. This chould change with -// future version of Cuda and this typedef would have to -// change accordingly. 
- -#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION ) - -typedef enable_if< - sizeof(::cudaTextureObject_t) == sizeof(const void *) , - ::cudaTextureObject_t >::type cuda_texture_object_type ; - -#else - -typedef const void * cuda_texture_object_type ; - -#endif - - -struct TextureAttribute : public AllocatorAttributeBase -{ - cuda_texture_object_type m_tex_obj ; - - TextureAttribute( void * const alloc_ptr - , size_t alloc_size - , cudaChannelFormatDesc const & desc - ); - - ~TextureAttribute(); -}; - -/// class CudaUnmanagedAllocator -/// does nothing when deallocate(ptr,size) is called -struct CudaUnmanagedAllocator -{ - static const char * name() - { - return "Cuda Unmanaged Allocator"; - } - - static void deallocate(void * /*ptr*/, size_t /*size*/) {} - - static bool support_texture_binding() { return true; } -}; - -/// class CudaUnmanagedAllocator -/// does nothing when deallocate(ptr,size) is called -struct CudaUnmanagedUVMAllocator -{ - static const char * name() - { - return "Cuda Unmanaged UVM Allocator"; - } - - static void deallocate(void * /*ptr*/, size_t /*size*/) {} - - static bool support_texture_binding() { return true; } -}; - -/// class CudaUnmanagedHostAllocator -/// does nothing when deallocate(ptr,size) is called -class CudaUnmanagedHostAllocator -{ -public: - static const char * name() - { - return "Cuda Unmanaged Host Allocator"; - } - // Unmanaged deallocate does nothing - static void deallocate(void * /*ptr*/, size_t /*size*/) {} -}; - -/// class CudaMallocAllocator -class CudaMallocAllocator -{ -public: - static const char * name() - { - return "Cuda Malloc Allocator"; - } - - static void* allocate(size_t size); - - static void deallocate(void * ptr, size_t); - - static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); - - static bool support_texture_binding() { return true; } -}; - -/// class CudaUVMAllocator -class CudaUVMAllocator -{ -public: - static const char * name() - { - return "Cuda UVM Allocator"; - } - 
- static void* allocate(size_t size); - - static void deallocate(void * ptr, size_t); - - static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); - - static bool support_texture_binding() { return true; } -}; - -/// class CudaHostAllocator -class CudaHostAllocator -{ -public: - static const char * name() - { - return "Cuda Host Allocator"; - } - - static void* allocate(size_t size); - - static void deallocate(void * ptr, size_t); - - static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); -}; - - -}} // namespace Kokkos::Impl - -#endif //KOKKOS_HAVE_CUDA - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - -#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp index 02c85d268c..2d8d07d077 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp @@ -51,8 +51,8 @@ #include <Cuda/Kokkos_Cuda_Error.hpp> #include <Cuda/Kokkos_Cuda_Internal.hpp> -#include <impl/Kokkos_AllocationTracker.hpp> #include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_Profiling_Interface.hpp> /*--------------------------------------------------------------------------*/ /* Standard 'C' libraries */ @@ -70,7 +70,7 @@ __device__ __constant__ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ; __device__ __constant__ -int* kokkos_impl_cuda_atomic_lock_array ; +Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ; #endif @@ -190,7 +190,7 @@ namespace { class CudaInternalDevices { public: - enum { MAXIMUM_DEVICE_COUNT = 8 }; + enum { MAXIMUM_DEVICE_COUNT = 64 }; struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ; int m_cudaDevCount ; @@ -206,6 +206,9 @@ CudaInternalDevices::CudaInternalDevices() CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) ); + if(m_cudaDevCount > MAXIMUM_DEVICE_COUNT) { + Kokkos::abort("Sorry, you 
have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos."); + } for ( int i = 0 ; i < m_cudaDevCount ; ++i ) { CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) ); } @@ -226,14 +229,6 @@ private: CudaInternal( const CudaInternal & ); CudaInternal & operator = ( const CudaInternal & ); -#if ! KOKKOS_USING_EXP_VIEW - - AllocationTracker m_scratchFlagsTracker; - AllocationTracker m_scratchSpaceTracker; - AllocationTracker m_scratchUnifiedTracker; - -#endif - public: @@ -255,6 +250,8 @@ public: size_type * m_scratchUnified ; cudaStream_t * m_stream ; + static int was_initialized; + static int was_finalized; static CudaInternal & singleton(); @@ -293,6 +290,8 @@ public: size_type * scratch_unified( const size_type size ); }; +int CudaInternal::was_initialized = 0; +int CudaInternal::was_finalized = 0; //---------------------------------------------------------------------------- @@ -367,6 +366,10 @@ CudaInternal & CudaInternal::singleton() void CudaInternal::initialize( int cuda_device_id , int stream_count ) { + if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); + was_initialized = 1; + if ( is_initialized() ) return; + enum { WordSize = sizeof(size_type) }; if ( ! 
HostSpace::execution_space::is_initialized() ) { @@ -526,11 +529,14 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) cudaThreadSetCacheConfig(cudaFuncCachePreferShared); // Init the array for used for arbitrarily sized atomics - Impl::init_lock_array_cuda_space(); + Impl::init_lock_arrays_cuda_space(); #ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE - int* lock_array_ptr = lock_array_cuda_space_ptr(); - cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) ); + Kokkos::Impl::CudaLockArraysStruct locks; + locks.atomic = atomic_lock_array_cuda_space_ptr(false); + locks.scratch = scratch_lock_array_cuda_space_ptr(false); + locks.threadid = threadid_lock_array_cuda_space_ptr(false); + cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ); #endif } @@ -548,14 +554,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size ) m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; -#if ! KOKKOS_USING_EXP_VIEW - - m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount ); - - m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr()); - -#else - typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ; Record * const r = Record::allocate( Kokkos::CudaSpace() @@ -566,9 +564,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size ) m_scratchFlags = reinterpret_cast<size_type *>( r->data() ); -#endif - - CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) ); } @@ -582,26 +577,15 @@ CudaInternal::scratch_space( const Cuda::size_type size ) m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; -#if ! 
KOKKOS_USING_EXP_VIEW - - m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount ); - - m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr()); - -#else - - typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ; - - Record * const r = Record::allocate( Kokkos::CudaSpace() - , "InternalScratchSpace" - , ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) ); - - Record::increment( r ); + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ; - m_scratchSpace = reinterpret_cast<size_type *>( r->data() ); + Record * const r = Record::allocate( Kokkos::CudaSpace() + , "InternalScratchSpace" + , ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) ); -#endif + Record::increment( r ); + m_scratchSpace = reinterpret_cast<size_type *>( r->data() ); } return m_scratchSpace ; @@ -615,14 +599,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size ) m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; -#if ! 
KOKKOS_USING_EXP_VIEW - - m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount ); - - m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() ); - -#else - typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ; Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace() @@ -632,9 +608,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size ) Record::increment( r ); m_scratchUnified = reinterpret_cast<size_type *>( r->data() ); - -#endif - } return m_scratchUnified ; @@ -644,9 +617,13 @@ CudaInternal::scratch_unified( const Cuda::size_type size ) void CudaInternal::finalize() { + was_finalized = 1; if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) { - lock_array_cuda_space_ptr(true); + atomic_lock_array_cuda_space_ptr(false); + scratch_lock_array_cuda_space_ptr(false); + threadid_lock_array_cuda_space_ptr(false); + if ( m_stream ) { for ( size_type i = 1 ; i < m_streamCount ; ++i ) { cudaStreamDestroy( m_stream[i] ); @@ -655,14 +632,6 @@ void CudaInternal::finalize() ::free( m_stream ); } -#if ! 
KOKKOS_USING_EXP_VIEW - - m_scratchSpaceTracker.clear(); - m_scratchFlagsTracker.clear(); - m_scratchUnifiedTracker.clear(); - -#else - typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ; typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ; @@ -670,8 +639,6 @@ void CudaInternal::finalize() RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) ); RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) ); -#endif - m_cudaDev = -1 ; m_multiProcCount = 0 ; m_maxWarpCount = 0 ; @@ -730,7 +697,13 @@ int Cuda::is_initialized() { return Impl::CudaInternal::singleton().is_initialized(); } void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances ) -{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances ); } +{ + Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances ); + + #if (KOKKOS_ENABLE_PROFILING) + Kokkos::Profiling::initialize(); + #endif +} std::vector<unsigned> Cuda::detect_device_arch() @@ -763,7 +736,13 @@ Cuda::size_type Cuda::device_arch() } void Cuda::finalize() -{ Impl::CudaInternal::singleton().finalize(); } +{ + Impl::CudaInternal::singleton().finalize(); + + #if (KOKKOS_ENABLE_PROFILING) + Kokkos::Profiling::finalize(); + #endif +} Cuda::Cuda() : m_device( Impl::CudaInternal::singleton().m_cudaDev ) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp index 328857d997..8b10d47f88 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp @@ -57,17 +57,20 @@ template<class DriverType, bool Large> struct CudaGetMaxBlockSize; template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))> -int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) { - return 
CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra); +int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length, + const size_t shmem_extra_block, const size_t shmem_extra_thread) { + return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread); } template<class DriverType> struct CudaGetMaxBlockSize<DriverType,true> { - static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) { + static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, + const size_t shmem_extra_block, const size_t shmem_extra_thread) { int numBlocks; int blockSize=32; - int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); + int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) + + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &numBlocks, cuda_parallel_launch_constant_memory<DriverType>, @@ -76,7 +79,8 @@ struct CudaGetMaxBlockSize<DriverType,true> { while (blockSize<1024 && numBlocks>0) { blockSize*=2; - sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length); + sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) + + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &numBlocks, @@ -91,11 +95,13 @@ struct CudaGetMaxBlockSize<DriverType,true> { template<class DriverType> struct CudaGetMaxBlockSize<DriverType,false> { - static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) { + static int get_block_size(const typename 
DriverType::functor_type & f, const size_t vector_length, + const size_t shmem_extra_block, const size_t shmem_extra_thread) { int numBlocks; int blockSize=32; - int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); + int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) + + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &numBlocks, cuda_parallel_launch_local_memory<DriverType>, @@ -104,7 +110,8 @@ struct CudaGetMaxBlockSize<DriverType,false> { while (blockSize<1024 && numBlocks>0) { blockSize*=2; - sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); + sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) + + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &numBlocks, @@ -123,13 +130,15 @@ template<class DriverType, bool Large> struct CudaGetOptBlockSize; template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))> -int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) { - return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra); +int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length, + const size_t shmem_extra_block, const size_t shmem_extra_thread) { + return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread); } template<class DriverType> struct CudaGetOptBlockSize<DriverType,true> { - static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) { + static int 
get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, + const size_t shmem_extra_block, const size_t shmem_extra_thread) { int blockSize=16; int numBlocks; int sharedmem; @@ -140,7 +149,8 @@ struct CudaGetOptBlockSize<DriverType,true> { blockSize*=2; //calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far - sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); + sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) + + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &numBlocks, cuda_parallel_launch_constant_memory<DriverType>, @@ -157,7 +167,8 @@ struct CudaGetOptBlockSize<DriverType,true> { template<class DriverType> struct CudaGetOptBlockSize<DriverType,false> { - static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) { + static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, + const size_t shmem_extra_block, const size_t shmem_extra_thread) { int blockSize=16; int numBlocks; int sharedmem; @@ -166,7 +177,8 @@ struct CudaGetOptBlockSize<DriverType,false> { while(blockSize<1024) { blockSize*=2; - sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); + sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) + + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length ); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &numBlocks, diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index 99296dd273..7afa06fdf5 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -99,13 +99,13 @@ public: __device__ inline const execution_space::scratch_memory_space & team_shmem() const - { return m_team_shared.set_team_thread_mode(1,0) ; } + { return m_team_shared.set_team_thread_mode(0,1,0) ; } __device__ inline - const execution_space::scratch_memory_space & team_scratch(int) const - { return m_team_shared.set_team_thread_mode(1,0) ; } + const execution_space::scratch_memory_space & team_scratch(const int& level) const + { return m_team_shared.set_team_thread_mode(level,1,0) ; } __device__ inline - const execution_space::scratch_memory_space & thread_scratch(int) const - { return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; } + const execution_space::scratch_memory_space & thread_scratch(const int& level) const + { return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; } __device__ inline int league_rank() const { return m_league_rank ; } __device__ inline int league_size() const { return m_league_size ; } @@ -122,6 +122,7 @@ public: } team_barrier(); value = sh_val; + team_barrier(); } #ifdef KOKKOS_HAVE_CXX11 @@ -203,10 +204,12 @@ public: CudaTeamMember( void * shared , const int shared_begin , const int shared_size + , void* scratch_level_1_ptr + , const int scratch_level_1_size , const int arg_league_rank , const int arg_league_size ) : m_team_reduce( shared ) - , m_team_shared( ((char *)shared) + shared_begin , shared_size ) + , m_team_shared( ((char *)shared) + shared_begin , shared_size, scratch_level_1_ptr, scratch_level_1_size) , m_league_rank( arg_league_rank ) , m_league_size( arg_league_size ) {} @@ -214,11 +217,11 @@ public: #else const execution_space::scratch_memory_space & team_shmem() const - { return m_team_shared.set_team_thread_mode(1,0) ; } - const execution_space::scratch_memory_space & team_scratch(int) const - { return m_team_shared.set_team_thread_mode(1,0) ; } - const execution_space::scratch_memory_space & 
thread_scratch(int) const - { return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; } + { return m_team_shared.set_team_thread_mode(0, 1,0) ; } + const execution_space::scratch_memory_space & team_scratch(const int& level) const + { return m_team_shared.set_team_thread_mode(level,1,0) ; } + const execution_space::scratch_memory_space & thread_scratch(const int& level) const + { return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; } int league_rank() const {return 0;} int league_size() const {return 1;} @@ -245,6 +248,8 @@ public: CudaTeamMember( void * shared , const int shared_begin , const int shared_end + , void* scratch_level_1_ptr + , const int scratch_level_1_size , const int arg_league_rank , const int arg_league_size ); @@ -272,8 +277,8 @@ private: int m_league_size ; int m_team_size ; int m_vector_length ; - int m_team_scratch_size ; - int m_thread_scratch_size ; + int m_team_scratch_size[2] ; + int m_thread_scratch_size[2] ; int m_chunk_size; public: @@ -285,8 +290,10 @@ public: m_league_size = p.m_league_size; m_team_size = p.m_team_size; m_vector_length = p.m_vector_length; - m_team_scratch_size = p.m_team_scratch_size; - m_thread_scratch_size = p.m_thread_scratch_size; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; m_chunk_size = p.m_chunk_size; return *this; } @@ -332,14 +339,23 @@ public: inline int vector_length() const { return m_vector_length ; } inline int team_size() const { return m_team_size ; } inline int league_size() const { return m_league_size ; } - inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size; } + inline int scratch_size(int level, int team_size_ = -1) const { + if(team_size_<0) team_size_ = m_team_size; + return m_team_scratch_size[level] + 
team_size_*m_thread_scratch_size[level]; + } + inline size_t team_scratch_size(int level) const { + return m_team_scratch_size[level]; + } + inline size_t thread_scratch_size(int level) const { + return m_thread_scratch_size[level]; + } TeamPolicyInternal() : m_league_size( 0 ) , m_team_size( 0 ) , m_vector_length( 0 ) - , m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + , m_team_scratch_size {0,0} + , m_thread_scratch_size {0,0} , m_chunk_size ( 32 ) {} @@ -351,8 +367,8 @@ public: : m_league_size( league_size_ ) , m_team_size( team_size_request ) , m_vector_length( vector_length_request ) - , m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + , m_team_scratch_size {0,0} + , m_thread_scratch_size {0,0} , m_chunk_size ( 32 ) { // Allow only power-of-two vector_length @@ -378,8 +394,8 @@ public: : m_league_size( league_size_ ) , m_team_size( -1 ) , m_vector_length( vector_length_request ) - , m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + , m_team_scratch_size {0,0} + , m_thread_scratch_size {0,0} , m_chunk_size ( 32 ) { // Allow only power-of-two vector_length @@ -398,8 +414,8 @@ public: : m_league_size( league_size_ ) , m_team_size( team_size_request ) , m_vector_length ( vector_length_request ) - , m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + , m_team_scratch_size {0,0} + , m_thread_scratch_size {0,0} , m_chunk_size ( 32 ) { // Allow only power-of-two vector_length @@ -423,8 +439,8 @@ public: : m_league_size( league_size_ ) , m_team_size( -1 ) , m_vector_length ( vector_length_request ) - , m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + , m_team_scratch_size {0,0} + , m_thread_scratch_size {0,0} , m_chunk_size ( 32 ) { // Allow only power-of-two vector_length @@ -448,26 +464,23 @@ public: /** \brief set per team scratch size for a specific level of the scratch hierarchy */ inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const { - (void) level; TeamPolicyInternal 
p = *this; - p.m_team_scratch_size = per_team.value; + p.m_team_scratch_size[level] = per_team.value; return p; }; /** \brief set per thread scratch size for a specific level of the scratch hierarchy */ inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const { - (void) level; TeamPolicyInternal p = *this; - p.m_thread_scratch_size = per_thread.value; + p.m_thread_scratch_size[level] = per_thread.value; return p; }; /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */ inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const { - (void) level; TeamPolicyInternal p = *this; - p.m_team_scratch_size = per_team.value; - p.m_thread_scratch_size = per_thread.value; + p.m_team_scratch_size[level] = per_team.value; + p.m_thread_scratch_size[level] = per_thread.value; return p; }; @@ -580,6 +593,8 @@ private: const size_type m_vector_size ; const size_type m_shmem_begin ; const size_type m_shmem_size ; + void* m_scratch_ptr[2] ; + const int m_scratch_size[2] ; template< class TagType > __device__ inline @@ -605,6 +620,8 @@ public: typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>() , m_shmem_begin , m_shmem_size + , m_scratch_ptr[1] + , m_scratch_size[1] , league_rank , m_league_size ) ); } @@ -627,22 +644,24 @@ public: : m_functor( arg_functor ) , m_league_size( arg_policy.league_size() ) , m_team_size( 0 <= arg_policy.team_size() ? 
arg_policy.team_size() : - Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.vector_length(), arg_policy.scratch_size() ) / arg_policy.vector_length() ) + Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() ) , m_vector_size( arg_policy.vector_length() ) , m_shmem_begin( sizeof(double) * ( m_team_size + 2 ) ) - , m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) ) + , m_shmem_size( arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) ) + , m_scratch_ptr{NULL,NULL} + , m_scratch_size{arg_policy.scratch_size(0,m_team_size),arg_policy.scratch_size(1,m_team_size)} { // Functor's reduce memory, team scan memory, and team shared memory depend upon team size. + m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size))); const int shmem_size_total = m_shmem_begin + m_shmem_size ; - if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) { Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); } - if ( m_team_size > - Kokkos::Impl::cuda_get_max_block_size< ParallelFor > - ( arg_functor , arg_policy.vector_length(), arg_policy.scratch_size() ) / arg_policy.vector_length()) { + if ( int(m_team_size) > + int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor > + ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) { Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); } } @@ -657,9 +676,10 @@ public: namespace Kokkos { namespace Impl { -template< class FunctorType , class ... 
Traits > +template< class FunctorType , class ReducerType, class ... Traits > class ParallelReduce< FunctorType , Kokkos::RangePolicy< Traits ... > + , ReducerType , Kokkos::Cuda > { @@ -671,8 +691,12 @@ private: typedef typename Policy::work_tag WorkTag ; typedef typename Policy::member_type Member ; - typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; public: @@ -686,11 +710,20 @@ public: const FunctorType m_functor ; const Policy m_policy ; + const ReducerType m_reducer ; const pointer_type m_result_ptr ; size_type * m_scratch_space ; size_type * m_scratch_flags ; size_type * m_unified_space ; + // Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit + enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) }; + // Some crutch to do function overloading +private: + typedef double DummyShflReductionType; + typedef int DummySHMEMReductionType; + +public: template< class TagType > __device__ inline typename std::enable_if< std::is_same< TagType , void >::value >::type @@ -703,17 +736,20 @@ public: exec_range( const Member & i , reference_type update ) const { m_functor( TagType() , i , update ); } -#if ! 
defined( KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION ) + __device__ inline + void operator() () const { + run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) ); + } __device__ inline - void operator()(void) const + void run(const DummySHMEMReductionType& ) const { const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) > - word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) ); + word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) ); { reference_type value = - ValueInit::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value ); + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value ); // Number of blocks is bounded so that the reduction can be limited to two passes. // Each thread block is given an approximately equal amount of work to perform. @@ -729,8 +765,8 @@ public: } // Reduce with final value at blockDim.y - 1 location. - if ( cuda_single_inter_block_reduce_scan<false,FunctorType,WorkTag>( - m_functor , blockIdx.x , gridDim.x , + if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>( + ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) { // This is the final block with the final result at the final threads' location @@ -739,7 +775,7 @@ public: size_type * const global = m_unified_space ? 
m_unified_space : m_scratch_space ; if ( threadIdx.y == 0 ) { - Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , shared ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared ); } if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } @@ -748,20 +784,18 @@ public: } } -#else /* defined( KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION ) */ - __device__ inline - void operator()(void) const + void run(const DummyShflReductionType&) const { - value_type value = 0; - + value_type value; + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value); // Number of blocks is bounded so that the reduction can be limited to two passes. // Each thread block is given an approximately equal amount of work to perform. // Accumulate the values for this block. // The accumulation ordering does not match the final pass, but is arithmatically equivalent. - const Policy range( m_policy , blockIdx.x , gridDim.x ); + const WorkRange range( m_policy , blockIdx.x , gridDim.x ); for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ; iwork < iwork_end ; iwork += blockDim.y ) { @@ -769,20 +803,23 @@ public: } pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ; + int max_active_thread = range.end()-range.begin() < blockDim.y ? 
range.end() - range.begin():blockDim.y; - max_active_thread = max_active_thread == 0?blockDim.y:max_active_thread; - if(Impl::cuda_inter_block_reduction<FunctorType,Impl::JoinAdd<value_type> > - (value,Impl::JoinAdd<value_type>(),m_scratch_space,result,m_scratch_flags,max_active_thread)) { + + max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread; + + value_type init; + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init); + if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag> + (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) { const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; if(id==0) { - Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , (void*) &value ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); *result = value; } } } -#endif - // Determine block size constrained by shared memory: static inline unsigned local_block_size( const FunctorType & f ) @@ -799,20 +836,17 @@ public: if ( nwork ) { const int block_size = local_block_size( m_functor ); - m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( m_functor ) * block_size /* block_size == max block_count */ ); + m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ ); m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); - m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( m_functor ) ); + m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) ); // REQUIRED ( 1 , N , 1 ) const dim3 block( 1 , block_size , 1 ); // Required grid.x <= block.y const dim3 grid( std::min( int(block.y) , int( ( nwork + block.y - 1 ) 
/ block.y ) ) , 1 , 1 ); -#ifdef KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION - const int shmem = 0; -#else - const int shmem = cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y ); -#endif + const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y ); + CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute @@ -820,18 +854,18 @@ public: if ( m_result_ptr ) { if ( m_unified_space ) { - const int count = ValueTraits::value_count( m_functor ); + const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; } } else { - const int size = ValueTraits::value_size( m_functor ); + const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ); DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size ); } } } else { if (m_result_ptr) { - ValueInit::init( m_functor , m_result_ptr ); + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr ); } } } @@ -840,21 +874,37 @@ public: ParallelReduce( const FunctorType & arg_functor , const Policy & arg_policy , const HostViewType & arg_result - ) + , typename std::enable_if< + Kokkos::is_view< HostViewType >::value + ,void*>::type = NULL) : m_functor( arg_functor ) , m_policy( arg_policy ) + , m_reducer( InvalidType() ) , m_result_ptr( arg_result.ptr_on_device() ) , m_scratch_space( 0 ) , m_scratch_flags( 0 ) , m_unified_space( 0 ) { } + + ParallelReduce( const FunctorType & arg_functor + , const Policy & arg_policy + , const ReducerType & reducer) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + , m_reducer( reducer ) + , m_result_ptr( reducer.result_view().ptr_on_device() ) + , m_scratch_space( 0 ) + , m_scratch_flags( 0 ) + , m_unified_space( 0 ) + { } }; 
//---------------------------------------------------------------------------- -template< class FunctorType , class ... Properties > +template< class FunctorType , class ReducerType, class ... Properties > class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Properties ... > + , ReducerType , Kokkos::Cuda > { @@ -864,18 +914,29 @@ private: typedef typename Policy::member_type Member ; typedef typename Policy::work_tag WorkTag ; - typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; + typedef typename ValueTraits::value_type value_type ; + public: typedef FunctorType functor_type ; typedef Cuda::size_type size_type ; + enum { UseShflReduction = (true && ValueTraits::StaticValueSize) }; + private: + typedef double DummyShflReductionType; + typedef int DummySHMEMReductionType; + // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1 // shared memory utilization: @@ -886,6 +947,7 @@ private: // const FunctorType m_functor ; + const ReducerType m_reducer ; const pointer_type m_result_ptr ; size_type * m_scratch_space ; size_type * m_scratch_flags ; @@ -893,8 +955,11 @@ private: size_type m_team_begin ; size_type m_shmem_begin ; size_type m_shmem_size ; + void* m_scratch_ptr[2] ; + int m_scratch_size[2] ; const size_type m_league_size ; const size_type m_team_size ; + const size_type 
m_vector_size ; template< class TagType > __device__ inline @@ -911,13 +976,18 @@ private: public: __device__ inline - void operator()(void) const + void operator() () const { + run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) ); + } + + __device__ inline + void run(const DummySHMEMReductionType&) const { const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) > - word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) ); + word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) ); reference_type value = - ValueInit::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value ); + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value ); // Iterate this block through the league for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) { @@ -925,6 +995,8 @@ public: ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin , m_shmem_begin , m_shmem_size + , m_scratch_ptr[1] + , m_scratch_size[1] , league_rank , m_league_size ) , value ); @@ -932,7 +1004,7 @@ public: // Reduce with final value at blockDim.y - 1 location. if ( cuda_single_inter_block_reduce_scan<false,FunctorType,WorkTag>( - m_functor , blockIdx.x , gridDim.x , + ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) { // This is the final block with the final result at the final threads' location @@ -941,7 +1013,7 @@ public: size_type * const global = m_unified_space ? 
m_unified_space : m_scratch_space ; if ( threadIdx.y == 0 ) { - Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , shared ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared ); } if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } @@ -950,18 +1022,51 @@ public: } } + __device__ inline + void run(const DummyShflReductionType&) const + { + value_type value; + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value); + + // Iterate this block through the league + for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) { + this-> template exec_team< WorkTag > + ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin + , m_shmem_begin + , m_shmem_size + , m_scratch_ptr[1] + , m_scratch_size[1] + , league_rank + , m_league_size ) + , value ); + } + + pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ; + + value_type init; + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init); + if(Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag> + (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)) { + const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; + if(id==0) { + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); + *result = value; + } + } + } + inline void execute() { - const int block_count = std::min( m_league_size , m_team_size ); + const int block_count = UseShflReduction? 
std::min( m_league_size , size_type(1024) ) + :std::min( m_league_size , m_team_size ); - m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( m_functor ) * block_count ); + m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count ); m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); - m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( m_functor ) ); + m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) ); - // REQUIRED DIMENSIONS ( 1 , N , 1 ) - const dim3 block( 1 , m_team_size , 1 ); - const dim3 grid( std::min( int(m_league_size) , int(m_team_size) ) , 1 , 1 ); + const dim3 block( m_vector_size , m_team_size , 1 ); + const dim3 grid( block_count , 1 , 1 ); const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ; CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute @@ -970,11 +1075,11 @@ public: if ( m_result_ptr ) { if ( m_unified_space ) { - const int count = ValueTraits::value_count( m_functor ); + const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; } } else { - const int size = ValueTraits::value_size( m_functor ); + const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ); DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size ); } } @@ -984,8 +1089,11 @@ public: ParallelReduce( const FunctorType & arg_functor , const Policy & arg_policy , const HostViewType & arg_result - ) + , typename std::enable_if< + Kokkos::is_view< HostViewType >::value + ,void*>::type = NULL) : m_functor( arg_functor ) + , m_reducer( InvalidType() ) , m_result_ptr( arg_result.ptr_on_device() ) , m_scratch_space( 0 ) , 
m_scratch_flags( 0 ) @@ -993,39 +1101,107 @@ public: , m_team_begin( 0 ) , m_shmem_begin( 0 ) , m_shmem_size( 0 ) + , m_scratch_ptr{NULL,NULL} , m_league_size( arg_policy.league_size() ) , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() : - Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(), arg_policy.scratch_size() ) / arg_policy.vector_length() ) + Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(), + arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / + arg_policy.vector_length() ) + , m_vector_size( arg_policy.vector_length() ) + , m_scratch_size{arg_policy.scratch_size(0,m_team_size),arg_policy.scratch_size(1,m_team_size)} { // Return Init value if the number of worksets is zero if( arg_policy.league_size() == 0) { - ValueInit::init( m_functor , arg_result.ptr_on_device() ); + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , arg_result.ptr_on_device() ); return ; } - m_team_begin = cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size ); + m_team_begin = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size ); m_shmem_begin = sizeof(double) * ( m_team_size + 2 ); - m_shmem_size = arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size ); + m_shmem_size = arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size ); + m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size))); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = arg_policy.scratch_size(1,m_team_size); // The global parallel_reduce does not support vector_length other than 1 at the moment - if( arg_policy.vector_length() > 1) - Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a 
TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA."); + if( (arg_policy.vector_length() > 1) && !UseShflReduction ) + Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA for dynamic sized reduction types."); - if( m_team_size < 32) - Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA."); + if( (m_team_size < 32) && !UseShflReduction ) + Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA for dynamic sized reduction types."); // Functor's reduce memory, team scan memory, and team shared memory depend upon team size. const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ; - if ( ! Kokkos::Impl::is_integral_power_of_two( m_team_size ) || - CudaTraits::SharedMemoryCapacity < shmem_size_total ) { + if (! 
Kokkos::Impl::is_integral_power_of_two( m_team_size ) && !UseShflReduction ) { Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); } + if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) { + Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory")); + } + if ( m_team_size > Kokkos::Impl::cuda_get_max_block_size< ParallelReduce > - ( arg_functor , arg_policy.vector_length(), arg_policy.scratch_size() ) / arg_policy.vector_length()) { + ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length()) { + Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size.")); + } + + } + + ParallelReduce( const FunctorType & arg_functor + , const Policy & arg_policy + , const ReducerType & reducer) + : m_functor( arg_functor ) + , m_reducer( reducer ) + , m_result_ptr( reducer.result_view().ptr_on_device() ) + , m_scratch_space( 0 ) + , m_scratch_flags( 0 ) + , m_unified_space( 0 ) + , m_team_begin( 0 ) + , m_shmem_begin( 0 ) + , m_shmem_size( 0 ) + , m_scratch_ptr{NULL,NULL} + , m_league_size( arg_policy.league_size() ) + , m_team_size( 0 <= arg_policy.team_size() ? 
arg_policy.team_size() : + Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(), + arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / + arg_policy.vector_length() ) + , m_vector_size( arg_policy.vector_length() ) + { + // Return Init value if the number of worksets is zero + if( arg_policy.league_size() == 0) { + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr ); + return ; + } + + m_team_begin = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size ); + m_shmem_begin = sizeof(double) * ( m_team_size + 2 ); + m_shmem_size = arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size ); + m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size))); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = arg_policy.scratch_size(1,m_team_size); + + // The global parallel_reduce does not support vector_length other than 1 at the moment + if( (arg_policy.vector_length() > 1) && !UseShflReduction ) + Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA for dynamic sized reduction types."); + + if( (m_team_size < 32) && !UseShflReduction ) + Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA for dynamic sized reduction types."); + + // Functor's reduce memory, team scan memory, and team shared memory depend upon team size. + + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ; + + if ( (! 
Kokkos::Impl::is_integral_power_of_two( m_team_size ) && !UseShflReduction ) || + CudaTraits::SharedMemoryCapacity < shmem_size_total ) { + Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); + } + + if ( int(m_team_size) > + int(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce > + ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) { Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size.")); } @@ -1453,14 +1629,12 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >& loop_boundaries, const Lambda & lambda, ValueType& result) { #ifdef __CUDA_ARCH__ - ValueType val = ValueType(); + result = ValueType(); for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { - lambda(i,val); + lambda(i,result); } - result = val; - if (loop_boundaries.increment > 1) result += shfl_down(result, 1,loop_boundaries.increment); if (loop_boundaries.increment > 2) @@ -1659,6 +1833,11 @@ namespace Impl { //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator() f(i,val); } + __device__ inline + void operator() (typename ExecPolicy::member_type& i, ValueType& val) const { + //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator() + f(i,val); + } }; @@ -1692,11 +1871,22 @@ namespace Impl { enum {value = true}; }; + template< class FunctorType, class Enable = void> + struct ReduceFunctorHasShmemSize { + enum {value = false}; + }; + + template< class FunctorType> + struct ReduceFunctorHasShmemSize<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type > { + enum {value = true}; + }; + template< class FunctorType, bool Enable = ( 
FunctorDeclaresValueType<FunctorType,void>::value) || ( ReduceFunctorHasInit<FunctorType>::value ) || ( ReduceFunctorHasJoin<FunctorType>::value ) || - ( ReduceFunctorHasFinal<FunctorType>::value ) + ( ReduceFunctorHasFinal<FunctorType>::value ) || + ( ReduceFunctorHasShmemSize<FunctorType>::value ) > struct IsNonTrivialReduceFunctor { enum {value = false}; @@ -1717,376 +1907,18 @@ namespace Impl { typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType ,Tag >::reference_type reference_type; }; -} - -// general policy and view ouput -template< class ExecPolicy , class FunctorTypeIn , class ViewType > -inline -void parallel_reduce( const ExecPolicy & policy - , const FunctorTypeIn & functor_in - , const ViewType & result_view - , const std::string& str = "" - , typename Impl::enable_if< - ( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value && - Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value - )>::type * = 0 ) -{ - enum {FunctorHasValueType = Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value }; - typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,typename ViewType::value_type> >::type FunctorType; - FunctorType functor = Impl::if_c<FunctorHasValueType,FunctorTypeIn,FunctorType>::select(functor_in,FunctorType(functor_in)); - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelScan("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelScan(kpID); - } -#endif -} - -// general policy and pod or array of pod output -template< class ExecPolicy , class FunctorTypeIn , class ResultType> -inline -void parallel_reduce( const ExecPolicy & policy - , const FunctorTypeIn & functor_in - , ResultType& result_ref - , const std::string& str = "" - , typename Impl::enable_if< - ( ! Kokkos::is_view<ResultType>::value && - ! Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value && - ! Impl::is_integral< ExecPolicy >::value && - Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )>::type * = 0 ) -{ - typedef typename Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ResultType> FunctorType; - - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ; - - // Wrap the result output request in a view to inform the implementation - // of the type and memory space. - - typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) - , typename ValueTraits::value_type - , typename ValueTraits::pointer_type - >::type value_type ; - Kokkos::View< value_type - , HostSpace - , Kokkos::MemoryUnmanaged - > - result_view( ValueOps::pointer( result_ref ) - , 1 - ); - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelScan("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType, ExecPolicy > closure( FunctorType(functor_in) , policy , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelScan(kpID); - } -#endif -} - -// general policy and pod or array of pod output -template< class ExecPolicy , class FunctorType> -inline -void parallel_reduce( const ExecPolicy & policy - , const FunctorType & functor - , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type result_ref - , const std::string& str = "" - , typename Impl::enable_if< - ( Impl::IsNonTrivialReduceFunctor<FunctorType>::value && - ! Impl::is_integral< ExecPolicy >::value && - Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )>::type * = 0 ) -{ - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ; - - // Wrap the result output request in a view to inform the implementation - // of the type and memory space. - - typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) - , typename ValueTraits::value_type - , typename ValueTraits::pointer_type - >::type value_type ; - - Kokkos::View< value_type - , HostSpace - , Kokkos::MemoryUnmanaged - > - result_view( ValueOps::pointer( result_ref ) - , ValueTraits::value_count( functor ) - ); - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelScan("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelScan(kpID); - } -#endif -} - -// integral range policy and view ouput -template< class FunctorTypeIn , class ViewType > -inline -void parallel_reduce( const size_t work_count - , const FunctorTypeIn & functor_in - , const ViewType & result_view - , const std::string& str = "" - , typename Impl::enable_if<( Kokkos::is_view<ViewType>::value && - Impl::is_same< - typename Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space, - Kokkos::Cuda>::value - )>::type * = 0 ) -{ - enum {FunctorHasValueType = Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value }; - typedef typename - Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space - execution_space ; - - typedef RangePolicy< execution_space > ExecPolicy ; - - typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,typename ViewType::value_type> >::type FunctorType; - - FunctorType functor = Impl::if_c<FunctorHasValueType,FunctorTypeIn,FunctorType>::select(functor_in,FunctorType(functor_in)); - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelScan("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelScan(kpID); - } -#endif - -} - -// integral range policy and pod or array of pod output -template< class FunctorTypeIn , class ResultType> -inline -void parallel_reduce( const size_t work_count - , const FunctorTypeIn & functor_in - , ResultType& result - , const std::string& str = "" - , typename Impl::enable_if< ! Kokkos::is_view<ResultType>::value && - ! Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value && - Impl::is_same< - typename Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space, - Kokkos::Cuda>::value >::type * = 0 ) -{ - typedef typename - Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space - execution_space ; - typedef Kokkos::RangePolicy< execution_space > ExecPolicy ; - - typedef Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ResultType> FunctorType; - - - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; - typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ; - - - // Wrap the result output request in a view to inform the implementation - // of the type and memory space. 
- - typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) - , typename ValueTraits::value_type - , typename ValueTraits::pointer_type - >::type value_type ; - - Kokkos::View< value_type - , HostSpace - , Kokkos::MemoryUnmanaged - > - result_view( ValueOps::pointer( result ) - , 1 - ); - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType , ExecPolicy > closure( FunctorType(functor_in) , ExecPolicy(0,work_count) , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelScan(kpID); - } -#endif -} - -template< class FunctorType> -inline -void parallel_reduce( const size_t work_count - , const FunctorType & functor - , typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::reference_type result - , const std::string& str = "" - , typename Impl::enable_if< Impl::IsNonTrivialReduceFunctor<FunctorType>::value && - Impl::is_same< - typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space, - Kokkos::Cuda>::value >::type * = 0 ) -{ - - typedef typename - Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space - execution_space ; - typedef Kokkos::RangePolicy< execution_space > ExecPolicy ; - - - - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; - typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ; - - - // Wrap the result output request in a view to inform the implementation - // of the type and memory space. 
- - typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) - , typename ValueTraits::value_type - , typename ValueTraits::pointer_type - >::type value_type ; - - Kokkos::View< value_type - , HostSpace - , Kokkos::MemoryUnmanaged - > - result_view( ValueOps::pointer( result ) - , ValueTraits::value_count( functor ) - ); - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelScan(kpID); - } -#endif -} - -#ifdef KOKKOS_HAVE_CUDA -template< class ExecPolicy , class FunctorType , class ResultType > -inline -void parallel_reduce( const std::string & str - , const ExecPolicy & policy - , const FunctorType & functor - , ResultType * result) -{ - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl; - #endif - - parallel_reduce(policy,functor,result,str); - - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl; - #endif - (void) str; -} - -template< class ExecPolicy , class FunctorType , class ResultType > -inline -void parallel_reduce( const std::string & str - , const ExecPolicy & policy - , const FunctorType & functor - , ResultType & result) -{ - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl; - #endif + template< class FunctorTypeIn, class 
ExecPolicy, class ValueType> + struct ParallelReduceFunctorType<FunctorTypeIn,ExecPolicy,ValueType,Cuda> { - parallel_reduce(policy,functor,result,str); + enum {FunctorHasValueType = IsNonTrivialReduceFunctor<FunctorTypeIn>::value }; + typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ValueType> >::type functor_type; + static functor_type functor(const FunctorTypeIn& functor_in) { + return Impl::if_c<FunctorHasValueType,FunctorTypeIn,functor_type>::select(functor_in,functor_type(functor_in)); + } + }; - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl; - #endif - (void) str; } -template< class ExecPolicy , class FunctorType > -inline -void parallel_reduce( const std::string & str - , const ExecPolicy & policy - , const FunctorType & functor) -{ - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl; - #endif - - parallel_reduce(policy,functor,str); - - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl; - #endif - (void) str; -} -#endif } // namespace Kokkos #endif /* defined( __CUDACC__ ) */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 11871a6abc..1778f631c0 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -130,16 +130,17 @@ inline void cuda_intra_block_reduction( ValueType& value, cuda_inter_warp_reduction(value,join,max_active_thread); } -template< class FunctorType , class JoinOp> +template< class FunctorType , class JoinOp , class ArgTag = void > __device__ -bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type value, +bool 
cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgTag >::reference_type value, + typename FunctorValueTraits< FunctorType , ArgTag >::reference_type neutral, const JoinOp& join, Cuda::size_type * const m_scratch_space, - typename FunctorValueTraits< FunctorType , void >::pointer_type const result, + typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result, Cuda::size_type * const m_scratch_flags, const int max_active_thread = blockDim.y) { - typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type; - typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type; + typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type; + typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type; //Do the intra-block reduction with shfl operations and static shared memory cuda_intra_block_reduction(value,join,max_active_thread); @@ -170,7 +171,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void if(id == 0) *m_scratch_flags = 0; last_block = true; - value = 0; + value = neutral; pointer_type const volatile global = (pointer_type) m_scratch_space ; @@ -366,7 +367,12 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor , size_type * const shared = shared_data + word_count.value * BlockSizeMask ; size_type * const global = global_data + word_count.value * block_id ; +#if (__CUDA_ARCH__ < 500) for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; } +#else + for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; } +#endif + } // Contributing blocks note that their contribution has been completed via an atomic-increment flag diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp new file mode 100644 index 0000000000..701d267e1b --- /dev/null +++ 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp @@ -0,0 +1,179 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) + +#include <impl/Kokkos_TaskQueue_impl.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template class TaskQueue< Kokkos::Cuda > ; + +//---------------------------------------------------------------------------- + +__device__ +void TaskQueueSpecialization< Kokkos::Cuda >::driver + ( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue ) +{ + using Member = TaskExec< Kokkos::Cuda > ; + using Queue = TaskQueue< Kokkos::Cuda > ; + using task_root_type = TaskBase< Kokkos::Cuda , void , void > ; + + task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + + Member single_exec( 1 ); + Member team_exec( blockDim.y ); + + const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ; + + union { + task_root_type * ptr ; + int raw[2] ; + } task ; + + // Loop until all queues are empty and no tasks in flight + + do { + + // Each team lead attempts to acquire either a thread team task + // or collection of single thread tasks for the team. + + if ( 0 == warp_lane ) { + + task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? 
end : 0 ; + + // Loop by priority and then type + for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) { + for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) { + task.ptr = Queue::pop_task( & queue->m_ready[i][j] ); + } + } + +#if 0 +printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x + , uintptr_t(task.ptr)); +#endif + + } + + // shuffle broadcast + + task.raw[0] = __shfl( task.raw[0] , 0 ); + task.raw[1] = __shfl( task.raw[1] , 0 ); + + if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count + + if ( end != task.ptr ) { + if ( task_root_type::TaskTeam == task.ptr->m_task_type ) { + // Thread Team Task + (*task.ptr->m_apply)( task.ptr , & team_exec ); + } + else if ( 0 == threadIdx.y ) { + // Single Thread Task + (*task.ptr->m_apply)( task.ptr , & single_exec ); + } + + if ( 0 == warp_lane ) { + queue->complete( task.ptr ); + } + } + } while(1); +} + +namespace { + +__global__ +void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue ) +{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); } + +} + +void TaskQueueSpecialization< Kokkos::Cuda >::execute + ( TaskQueue< Kokkos::Cuda > * const queue ) +{ + const int warps_per_block = 4 ; + const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 ); + const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block ); + const int shared = 0 ; + const cudaStream_t stream = 0 ; + + CUDA_SAFE_CALL( cudaDeviceSynchronize() ); + +#if 0 +printf("cuda_task_queue_execute before\n"); +#endif + + // Query the stack size, in bytes: + // + // size_t stack_size = 0 ; + // CUDA_SAFE_CALL( cudaDeviceGetLimit( & stack_size , cudaLimitStackSize ) ); + // + // If not large enough then set the stack size, in bytes: + // + // CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) ); + + cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue ); + + CUDA_SAFE_CALL( cudaGetLastError() ); + + CUDA_SAFE_CALL( cudaDeviceSynchronize() ); + 
+#if 0 +printf("cuda_task_queue_execute after\n"); +#endif + +} + +}} /* namespace Kokkos::Impl */ + +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */ + + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp new file mode 100644 index 0000000000..9d9347cc8d --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -0,0 +1,519 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_CUDA_TASK_HPP +#define KOKKOS_IMPL_CUDA_TASK_HPP + +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +namespace { + +template< typename TaskType > +__global__ +void set_cuda_task_base_apply_function_pointer + ( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr ) +{ *ptr = TaskType::apply ; } + +} + +template<> +class TaskQueueSpecialization< Kokkos::Cuda > +{ +public: + + using execution_space = Kokkos::Cuda ; + using memory_space = Kokkos::CudaUVMSpace ; + using queue_type = TaskQueue< execution_space > ; + + static + void iff_single_thread_recursive_execute( queue_type * const ) {} + + __device__ + static void driver( queue_type * const ); + + static + void execute( queue_type * const ); + + template< typename FunctorType > + static + void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr ) + { + using TaskType = TaskBase< execution_space + , typename FunctorType::value_type + , FunctorType > ; + + CUDA_SAFE_CALL( cudaDeviceSynchronize() ); + + set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr); + + CUDA_SAFE_CALL( 
cudaGetLastError() ); + CUDA_SAFE_CALL( cudaDeviceSynchronize() ); + } +}; + +extern template class TaskQueue< Kokkos::Cuda > ; + +//---------------------------------------------------------------------------- +/**\brief Impl::TaskExec<Cuda> is the TaskPolicy<Cuda>::member_type + * passed to tasks running in a Cuda space. + * + * Cuda thread blocks for tasking are dimensioned: + * blockDim.x == vector length + * blockDim.y == team size + * blockDim.z == number of teams + * where + * blockDim.x * blockDim.y == WarpSize + * + * Both single thread and thread team tasks are run by a full Cuda warp. + * A single thread task is called by warp lane #0 and the remaining + * lanes of the warp are idle. + */ +template<> +class TaskExec< Kokkos::Cuda > +{ +private: + + TaskExec( TaskExec && ) = delete ; + TaskExec( TaskExec const & ) = delete ; + TaskExec & operator = ( TaskExec && ) = delete ; + TaskExec & operator = ( TaskExec const & ) = delete ; + + friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ; + friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ; + + const int m_team_size ; + + __device__ + TaskExec( int arg_team_size = blockDim.y ) + : m_team_size( arg_team_size ) {} + +public: + +#if defined( __CUDA_ARCH__ ) + __device__ void team_barrier() { /* __threadfence_block(); */ } + __device__ int team_rank() const { return threadIdx.y ; } + __device__ int team_size() const { return m_team_size ; } +#else + __host__ void team_barrier() {} + __host__ int team_rank() const { return 0 ; } + __host__ int team_size() const { return 0 ; } +#endif + +}; + +//---------------------------------------------------------------------------- + +template<typename iType> +struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > > +{ + typedef iType index_type; + const iType start ; + const iType end ; + const iType increment ; + const TaskExec< Kokkos::Cuda > & thread; + +#if defined( __CUDA_ARCH__ ) + + __device__ inline + 
TeamThreadRangeBoundariesStruct + ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count) + : start( threadIdx.y ) + , end(arg_count) + , increment( blockDim.y ) + , thread(arg_thread) + {} + + __device__ inline + TeamThreadRangeBoundariesStruct + ( const TaskExec< Kokkos::Cuda > & arg_thread + , const iType & arg_start + , const iType & arg_end + ) + : start( arg_start + threadIdx.y ) + , end( arg_end) + , increment( blockDim.y ) + , thread( arg_thread ) + {} + +#else + + TeamThreadRangeBoundariesStruct + ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count); + + TeamThreadRangeBoundariesStruct + ( const TaskExec< Kokkos::Cuda > & arg_thread + , const iType & arg_start + , const iType & arg_end + ); + +#endif + +}; + +//---------------------------------------------------------------------------- + +template<typename iType> +struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > > +{ + typedef iType index_type; + const iType start ; + const iType end ; + const iType increment ; + const TaskExec< Kokkos::Cuda > & thread; + +#if defined( __CUDA_ARCH__ ) + + __device__ inline + ThreadVectorRangeBoundariesStruct + ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count) + : start( threadIdx.x ) + , end(arg_count) + , increment( blockDim.x ) + , thread(arg_thread) + {} + +#else + + ThreadVectorRangeBoundariesStruct + ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count); + +#endif + +}; + +}} /* namespace Kokkos::Impl */ + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > > +TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread + , const iType & count ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count); +} + +template<typename iType> 
+KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > > +TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >(thread,start,end); +} + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > > +ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread + , const iType & count ) +{ + return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count); +} + +/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team. + * This functionality requires C++11 support. +*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for + ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >& loop_boundaries + , const Lambda& lambda + ) +{ + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i); + } +} + +// reduce across corresponding lanes between team members within warp +// assume stride*team_size == warp_size +template< typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void strided_shfl_warp_reduction + (const JoinType& join, + ValueType& val, + int team_size, + int stride) +{ + for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) { + join(val, Kokkos::shfl_down(val, lane_delta, team_size*stride)); + } +} + +// multiple within-warp non-strided reductions +template< typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void multi_shfl_warp_reduction + (const JoinType& join, + ValueType& val, + int vec_length) +{ + for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) { + join(val, Kokkos::shfl_down(val, lane_delta, 
vec_length)); + } +} + +// broadcast within warp +template< class ValueType > +KOKKOS_INLINE_FUNCTION +ValueType shfl_warp_broadcast + (ValueType& val, + int src_lane, + int width) +{ + return Kokkos::shfl(val, src_lane, width); +} + +// all-reduce across corresponding vector lanes between team members within warp +// assume vec_length*team_size == warp_size +// blockDim.x == vec_length == stride +// blockDim.y == team_size +// threadIdx.x == position in vec +// threadIdx.y == member number +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries, + const Lambda & lambda, + const JoinType& join, + ValueType& initialized_result) { + + ValueType result = initialized_result; + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,result); + } + initialized_result = result; + + strided_shfl_warp_reduction<ValueType, JoinType>( + join, + initialized_result, + loop_boundaries.thread.team_size(), + blockDim.x); + initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize ); +} + +// all-reduce across corresponding vector lanes between team members within warp +// if no join() provided, use sum +// assume vec_length*team_size == warp_size +// blockDim.x == vec_length == stride +// blockDim.y == team_size +// threadIdx.x == position in vec +// threadIdx.y == member number +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries, + const Lambda & lambda, + ValueType& initialized_result) { + + //TODO what is the point of creating this temporary? 
+ ValueType result = initialized_result; + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,result); + } + initialized_result = result; + + strided_shfl_warp_reduction( + [&] (ValueType& val1, const ValueType& val2) { val1 += val2; }, + initialized_result, + loop_boundaries.thread.team_size(), + blockDim.x); + initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize ); +} + +// all-reduce within team members within warp +// assume vec_length*team_size == warp_size +// blockDim.x == vec_length == stride +// blockDim.y == team_size +// threadIdx.x == position in vec +// threadIdx.y == member number +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries, + const Lambda & lambda, + const JoinType& join, + ValueType& initialized_result) { + + ValueType result = initialized_result; + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,result); + } + initialized_result = result; + + multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x); + initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x ); +} + +// all-reduce within team members within warp +// if no join() provided, use sum +// assume vec_length*team_size == warp_size +// blockDim.x == vec_length == stride +// blockDim.y == team_size +// threadIdx.x == position in vec +// threadIdx.y == member number +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries, + const Lambda & lambda, + ValueType& initialized_result) { + + ValueType result = initialized_result; + + 
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,result); + } + + initialized_result = result; + + //initialized_result = multi_shfl_warp_reduction( + multi_shfl_warp_reduction( + [&] (ValueType& val1, const ValueType& val2) { val1 += val2; }, + initialized_result, + blockDim.x); + initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x ); +} + +// scan across corresponding vector lanes between team members within warp +// assume vec_length*team_size == warp_size +// blockDim.x == vec_length == stride +// blockDim.y == team_size +// threadIdx.x == position in vec +// threadIdx.y == member number +template< typename ValueType, typename iType, class Lambda > +KOKKOS_INLINE_FUNCTION +void parallel_scan + (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries, + const Lambda & lambda) { + + ValueType accum = 0 ; + ValueType val, y, local_total; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + val = 0; + lambda(i,val,false); + + // intra-blockDim.y exclusive scan on 'val' + // accum = accumulated, sum in total for this iteration + + // INCLUSIVE scan + for( int offset = blockDim.x ; offset < Impl::CudaTraits::WarpSize ; offset <<= 1 ) { + y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize); + if(threadIdx.y*blockDim.x >= offset) { val += y; } + } + + // pass accum to all threads + local_total = shfl_warp_broadcast<ValueType>(val, + threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x, + Impl::CudaTraits::WarpSize); + + // make EXCLUSIVE scan by shifting values over one + val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize); + if ( threadIdx.y == 0 ) { val = 0 ; } + + val += accum; + lambda(i,val,true); + accum += local_total; + } +} + +// scan within team member (vector) within warp +// assume vec_length*team_size == warp_size +// blockDim.x == vec_length == stride 
+// blockDim.y == team_size +// threadIdx.x == position in vec +// threadIdx.y == member number +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_scan + (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries, + const Lambda & lambda) +{ + ValueType accum = 0 ; + ValueType val, y, local_total; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + val = 0; + lambda(i,val,false); + + // intra-blockDim.x exclusive scan on 'val' + // accum = accumulated, sum in total for this iteration + + // INCLUSIVE scan + for( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) { + y = Kokkos::shfl_up(val, offset, blockDim.x); + if(threadIdx.x >= offset) { val += y; } + } + + // pass accum to all threads + local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x); + + // make EXCLUSIVE scan by shifting values over one + val = Kokkos::shfl_up(val, 1, blockDim.x); + if ( threadIdx.x == 0 ) { val = 0 ; } + + val += accum; + lambda(i,val,true); + accum += local_total; + } +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ +#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */ + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp index f470a0a6ef..bb3cd2640d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp @@ -46,9 +46,10 @@ #include <stdio.h> #include <iostream> #include <sstream> +#include <Kokkos_Core.hpp> #include <Cuda/Kokkos_Cuda_TaskPolicy.hpp> -#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) +#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) // #define DETAILED_PRINT @@ 
-93,9 +94,8 @@ CudaTaskPolicyQueue , const unsigned arg_team_size ) : m_space( Kokkos::CudaUVMSpace() - , arg_task_max_size - , arg_task_max_size * arg_task_max_count - , 1 /* only one level of memory pool */ + , arg_task_max_size * arg_task_max_count * 1.2 + , 16 /* log2(superblock size) */ ) , m_team { 0 , 0 , 0 } , m_serial { 0 , 0 , 0 } @@ -172,6 +172,8 @@ if ( IS_TEAM_LEAD && 0 != team_task ) { member( kokkos_impl_cuda_shared_memory<void>() , 16 /* shared_begin */ , team_task->m_shmem_size /* shared size */ + , 0 /* scratch level 1 pointer */ + , 0 /* scratch level 1 size */ , 0 /* league rank */ , 1 /* league size */ ); @@ -926,5 +928,5 @@ void Task::clear_dependence() } /* namespace Kokkos */ -#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */ +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp index 1b645c8819..e71512f039 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp @@ -47,19 +47,11 @@ #define KOKKOS_CUDA_TASKPOLICY_HPP #include <Kokkos_Core_fwd.hpp> - -#if defined( KOKKOS_HAVE_CUDA ) && \ - defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) - -#define KOKKOS_ENABLE_CUDA_TASK_POLICY - -/* The TaskPolicy< Cuda > capability requires nvcc using the option: - * --relocatable-device-code=true - */ - #include <Kokkos_Cuda.hpp> #include <Kokkos_TaskPolicy.hpp> +#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) + //---------------------------------------------------------------------------- namespace Kokkos { @@ -81,8 +73,6 @@ public: private: - friend struct CudaTaskPolicyQueue ; - CudaTaskPolicyQueue * m_policy ; TaskMember * volatile * m_queue ; function_team_type m_team ; ///< Apply function on CUDA @@ -819,9 +809,11 @@ public: static member_type member_single() { return - member_type( 0 /* shared memory */ - , 0 /* shared memory begin 
*/ - , 0 /* shared memory size */ + member_type( 0 /* shared memory pointer */ + , 0 /* shared memory begin offset */ + , 0 /* shared memory end offset */ + , 0 /* scratch level_1 pointer */ + , 0 /* scratch level_1 size */ , 0 /* league rank */ , 1 /* league size */ ); } @@ -832,10 +824,10 @@ public: } /* namespace Experimental */ } /* namespace Kokkos */ -#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) */ //---------------------------------------------------------------------------- +#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */ #endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp index 84c2e75dc2..92f6fc1f5f 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -56,8 +56,6 @@ #include <impl/Kokkos_Shape.hpp> #include <Kokkos_View.hpp> -#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp> - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -90,343 +88,6 @@ struct AssertShapeBoundsAbort< CudaSpace > //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#if ! KOKKOS_USING_EXP_VIEW - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4) -// Via reinterpret_case this can be used to support all scalar types of those sizes. 
-// Any other scalar type falls back to either normal reads out of global memory, -// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0) - -template< typename ValueType - , class MemorySpace - , class AliasType = - typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 4 ) , int , - typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 8 ) , ::int2 , - typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 , - typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 32 ) , ::float4 ,void - >::type - >::type - >::type - >::type - > -class CudaTextureFetch { -private: - - cuda_texture_object_type m_obj ; - const ValueType * m_alloc_ptr ; - int m_offset ; - - void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker ) - { - typedef char const * const byte; - - m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr()); - - size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr); - const bool ok_aligned = 0 == byte_offset % sizeof(ValueType); - - const size_t count = tracker.alloc_size() / sizeof(ValueType); - const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count)); - - if (ok_aligned && ok_contains) { - if (tracker.attribute() == NULL ) { - MemorySpace::texture_object_attach( - tracker - , sizeof(ValueType) - , cudaCreateChannelDesc< AliasType >() - ); - } - m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj; - m_offset = arg_ptr - m_alloc_ptr; - } - else if( !ok_contains ) { - throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer."); - } - else { - throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer."); - } - } - -public: - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {} - - KOKKOS_INLINE_FUNCTION - ~CudaTextureFetch() {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const 
CudaTextureFetch & rhs ) - : m_obj( rhs.m_obj ) - , m_alloc_ptr( rhs.m_alloc_ptr ) - , m_offset( rhs.m_offset ) - {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) - { - m_obj = rhs.m_obj ; - m_alloc_ptr = rhs.m_alloc_ptr ; - m_offset = rhs.m_offset ; - return *this ; - } - - KOKKOS_INLINE_FUNCTION explicit - CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker ) - : m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0) - { - #if defined( KOKKOS_USE_LDG_INTRINSIC ) - m_alloc_ptr(arg_ptr); - #elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ ) - if ( arg_ptr != NULL ) { - if ( tracker.is_valid() ) { - attach( arg_ptr, tracker ); - } - else { - AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr); - if ( found_tracker.is_valid() ) { - attach( arg_ptr, found_tracker ); - } else { - throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!"); - } - } - } - #endif - } - - KOKKOS_INLINE_FUNCTION - operator const ValueType * () const { return m_alloc_ptr + m_offset ; } - - - template< typename iType > - KOKKOS_INLINE_FUNCTION - ValueType operator[]( const iType & i ) const - { - #if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) - AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i])); - return *(reinterpret_cast<ValueType*> (&v)); - #elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) - AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset ); - return *(reinterpret_cast<ValueType*> (&v)); - #else - return m_alloc_ptr[ i + m_offset ]; - #endif - } -}; - - -template< typename ValueType, class MemorySpace > -class CudaTextureFetch< const ValueType, MemorySpace, float4 > { -private: - typedef float4 AliasType; - cuda_texture_object_type m_obj ; - const ValueType * m_alloc_ptr ; - int m_offset ; - - void attach( const ValueType * const arg_ptr, AllocationTracker 
const & tracker ) - { - typedef char const * const byte; - - m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr()); - - size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr); - const bool ok_aligned = 0 == byte_offset % sizeof(ValueType); - - const size_t count = tracker.alloc_size() / sizeof(ValueType); - const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count)); - - if (ok_aligned && ok_contains) { - if (tracker.attribute() == NULL ) { - MemorySpace::texture_object_attach( - tracker - , sizeof(ValueType) - , cudaCreateChannelDesc< AliasType >() - ); - } - m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj; - m_offset = arg_ptr - m_alloc_ptr; - } - else if( !ok_contains ) { - throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer."); - } - else { - throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer."); - } - } - -public: - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {} - - KOKKOS_INLINE_FUNCTION - ~CudaTextureFetch() {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const CudaTextureFetch & rhs ) - : m_obj( rhs.m_obj ) - , m_alloc_ptr( rhs.m_alloc_ptr ) - , m_offset( rhs.m_offset ) - {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) - { - m_obj = rhs.m_obj ; - m_alloc_ptr = rhs.m_alloc_ptr ; - m_offset = rhs.m_offset ; - return *this ; - } - - KOKKOS_INLINE_FUNCTION explicit - CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker ) - : m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0) - { - #if defined( KOKKOS_USE_LDG_INTRINSIC ) - m_alloc_ptr(arg_ptr); - #elif defined( __CUDACC__ ) && ! 
defined( __CUDA_ARCH__ ) - if ( arg_ptr != NULL ) { - if ( tracker.is_valid() ) { - attach( arg_ptr, tracker ); - } - else { - AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr); - if ( found_tracker.is_valid() ) { - attach( arg_ptr, found_tracker ); - } else { - throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!"); - } - } - } - #endif - } - - KOKKOS_INLINE_FUNCTION - operator const ValueType * () const { return m_alloc_ptr + m_offset ; } - - - template< typename iType > - KOKKOS_INLINE_FUNCTION - ValueType operator[]( const iType & i ) const - { - #if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) - AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i])); - return *(reinterpret_cast<ValueType*> (&v)); - #elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) - union Float4ValueType { - float4 f4[2]; - ValueType val; - }; - Float4ValueType convert; - convert.f4[0] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset) ); - convert.f4[1] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset)+1 ); - return convert.val; - #else - return m_alloc_ptr[ i + m_offset ]; - #endif - } -}; - -template< typename ValueType, class MemorySpace > -class CudaTextureFetch< const ValueType, MemorySpace, void > -{ -private: - const ValueType * m_ptr ; -public: - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch() : m_ptr(0) {}; - - KOKKOS_INLINE_FUNCTION - ~CudaTextureFetch() { - } - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) { - m_ptr = rhs.m_ptr; - return *this ; - } - - explicit KOKKOS_INLINE_FUNCTION - CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) { - 
m_ptr = base_view_ptr; - } - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = (const ValueType* base_view_ptr) { - m_ptr = base_view_ptr; - return *this; - } - - - KOKKOS_INLINE_FUNCTION - operator const ValueType * () const { return m_ptr ; } - - - template< typename iType > - KOKKOS_INLINE_FUNCTION - ValueType operator[]( const iType & i ) const - { - return m_ptr[ i ]; - } -}; - -} // namespace Impl -} // namespace Kokkos - - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization - * if 'const' value type, CudaSpace and random access. - */ -template< class ViewTraits > -class ViewDataHandle< ViewTraits , - typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value || - is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value ) - && - is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value - && - ViewTraits::memory_traits::RandomAccess - >::type > -{ -public: - enum { ReturnTypeIsReference = false }; - - typedef Impl::CudaTextureFetch< typename ViewTraits::value_type - , typename ViewTraits::memory_space> handle_type; - - KOKKOS_INLINE_FUNCTION - static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker ) - { - return handle_type(arg_data_ptr, arg_tracker); - } - - typedef typename ViewTraits::value_type return_type; -}; - -} -} - -#endif /* #if ! 
KOKKOS_USING_EXP_VIEW */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - #endif // KOKKOS_HAVE_CUDA #endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */ diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp new file mode 100644 index 0000000000..e813285fc7 --- /dev/null +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -0,0 +1,611 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP +#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP + +#include <Kokkos_ExecPolicy.hpp> +#include <Kokkos_Parallel.hpp> +#include <initializer_list> + +#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__) +#define KOKKOS_MDRANGE_IVDEP +#endif + +namespace Kokkos { namespace Experimental { + +enum class Iterate +{ + Default, // Default for the device + Left, // Left indices stride fastest + Right, // Right indices stride fastest + Flat, // Do not tile, only valid for inner direction +}; + +template <typename ExecSpace> +struct default_outer_direction +{ + using type = Iterate; + static constexpr Iterate value = Iterate::Right; +}; + +template <typename ExecSpace> +struct default_inner_direction +{ + using type = Iterate; + static constexpr Iterate value = Iterate::Right; +}; + + +// Iteration Pattern +template < unsigned N + , Iterate OuterDir = Iterate::Default + , Iterate InnerDir = Iterate::Default + > +struct Rank +{ + static_assert( N != 0u, "Kokkos Error: rank 0 undefined"); + static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range"); + static_assert( N < 4u, "Kokkos Error: Unsupported rank..."); + + using iteration_pattern = Rank<N, 
OuterDir, InnerDir>; + + static constexpr int rank = N; + static constexpr Iterate outer_direction = OuterDir; + static constexpr Iterate inner_direction = InnerDir; +}; + + + +// multi-dimensional iteration pattern +template <typename... Properties> +struct MDRangePolicy +{ + using range_policy = RangePolicy<Properties...>; + + static_assert( !std::is_same<range_policy,void>::value + , "Kokkos Error: MD iteration pattern not defined" ); + + using iteration_pattern = typename range_policy::iteration_pattern; + using work_tag = typename range_policy::work_tag; + + static constexpr int rank = iteration_pattern::rank; + + static constexpr int outer_direction = static_cast<int> ( + (iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat) + ? iteration_pattern::outer_direction + : default_outer_direction< typename range_policy::execution_space>::value ); + + static constexpr int inner_direction = static_cast<int> ( + iteration_pattern::inner_direction != Iterate::Default + ? 
iteration_pattern::inner_direction + : default_inner_direction< typename range_policy::execution_space>::value ) ; + + + // Ugly ugly workaround intel 14 not handling scoped enum correctly + static constexpr int Flat = static_cast<int>( Iterate::Flat ); + static constexpr int Right = static_cast<int>( Iterate::Right ); + + + using size_type = typename range_policy::index_type; + using index_type = typename std::make_signed<size_type>::type; + + + template <typename I> + MDRangePolicy( std::initializer_list<I> upper_corner ) + { + static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" ); + + // TODO check size of lists equal to rank + // static_asserts on initializer_list.size() require c++14 + + //static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" ); + + const auto u = upper_corner.begin(); + + m_num_tiles = 1; + for (int i=0; i<rank; ++i) { + m_offset[i] = static_cast<index_type>(0); + m_dim[i] = static_cast<index_type>(u[i]); + if (inner_direction != Flat) { + // default tile size to 4 + m_tile[i] = 4; + } else { + m_tile[i] = 1; + } + m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i]; + m_num_tiles *= m_tile_dim[i]; + } + } + + template <typename IA, typename IB> + MDRangePolicy( std::initializer_list<IA> corner_a + , std::initializer_list<IB> corner_b + ) + { + static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" ); + static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" ); + + // TODO check size of lists equal to rank + // static_asserts on initializer_list.size() require c++14 + //static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" ); + //static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" ); + + + using A = typename std::make_signed<IA>::type; + using B = typename std::make_signed<IB>::type; + + const auto a = 
[=](int i) { return static_cast<A>(corner_a.begin()[i]); }; + const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); }; + + m_num_tiles = 1; + for (int i=0; i<rank; ++i) { + m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i)); + m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i)); + if (inner_direction != Flat) { + // default tile size to 4 + m_tile[i] = 4; + } else { + m_tile[i] = 1; + } + m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i]; + m_num_tiles *= m_tile_dim[i]; + } + } + + template <typename IA, typename IB, typename T> + MDRangePolicy( std::initializer_list<IA> corner_a + , std::initializer_list<IB> corner_b + , std::initializer_list<T> tile + ) + { + static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" ); + static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" ); + static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" ); + static_assert( inner_direction != Flat, "Kokkos Error: tiling not support with flat iteration" ); + + // TODO check size of lists equal to rank + // static_asserts on initializer_list.size() require c++14 + //static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" ); + //static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" ); + //static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" ); + + using A = typename std::make_signed<IA>::type; + using B = typename std::make_signed<IB>::type; + + const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); }; + const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); }; + const auto t = tile.begin(); + + m_num_tiles = 1; + for (int i=0; i<rank; ++i) { + m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i)); + m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? 
b(i) - a(i) : a(i) - b(i)); + m_tile[i] = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 ); + m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i]; + m_num_tiles *= m_tile_dim[i]; + } + } + + index_type m_offset[rank]; + index_type m_dim[rank]; + int m_tile[rank]; + index_type m_tile_dim[rank]; + size_type m_num_tiles; // product of tile dims +}; + +namespace Impl { + +// Serial, Threads, OpenMP +// use enable_if to overload for Cuda +template < typename MDRange, typename Functor, typename Enable = void > +struct MDForFunctor +{ + using work_tag = typename MDRange::work_tag; + using index_type = typename MDRange::index_type; + using size_type = typename MDRange::size_type; + + MDRange m_range; + Functor m_func; + + KOKKOS_INLINE_FUNCTION + MDForFunctor( MDRange const& range, Functor const& f ) + : m_range(range) + , m_func( f ) + {} + + KOKKOS_INLINE_FUNCTION + MDForFunctor( MDRange const& range, Functor && f ) + : m_range(range) + , m_func( std::forward<Functor>(f) ) + {} + + KOKKOS_INLINE_FUNCTION + MDForFunctor( MDRange && range, Functor const& f ) + : m_range( std::forward<MDRange>(range) ) + , m_func( f ) + {} + + KOKKOS_INLINE_FUNCTION + MDForFunctor( MDRange && range, Functor && f ) + : m_range( std::forward<MDRange>(range) ) + , m_func( std::forward<Functor>(f) ) + {} + + + KOKKOS_INLINE_FUNCTION + MDForFunctor( MDForFunctor const& ) = default; + + KOKKOS_INLINE_FUNCTION + MDForFunctor& operator=( MDForFunctor const& ) = default; + + KOKKOS_INLINE_FUNCTION + MDForFunctor( MDForFunctor && ) = default; + + KOKKOS_INLINE_FUNCTION + MDForFunctor& operator=( MDForFunctor && ) = default; + + // Rank-2, Flat, No Tag + template <typename Idx> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( std::is_integral<Idx>::value + && std::is_same<void, work_tag>::value + && MDRange::rank == 2 + && MDRange::inner_direction == MDRange::Flat + )>::type + operator()(Idx t) const + { + if ( MDRange::outer_direction == MDRange::Right ) { + m_func( m_range.m_offset[0] + ( 
t / m_range.m_dim[1] ) + , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) ); + } else { + m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] ) + , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) ); + } + } + + // Rank-2, Flat, Tag + template <typename Idx> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( std::is_integral<Idx>::value + && !std::is_same<void, work_tag>::value + && MDRange::rank == 2 + && MDRange::inner_direction == MDRange::Flat + )>::type + operator()(Idx t) const + { + if ( MDRange::outer_direction == MDRange::Right ) { + m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] ) + , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) ); + } else { + m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] ) + , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) ); + } + } + + // Rank-2, Not Flat, No Tag + template <typename Idx> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( std::is_integral<Idx>::value + && std::is_same<void, work_tag>::value + && MDRange::rank == 2 + && MDRange::inner_direction != MDRange::Flat + )>::type + operator()(Idx t) const + { + index_type t0, t1; + if ( MDRange::outer_direction == MDRange::Right ) { + t0 = t / m_range.m_tile_dim[1]; + t1 = t % m_range.m_tile_dim[1]; + } else { + t0 = t % m_range.m_tile_dim[0]; + t1 = t / m_range.m_tile_dim[0]; + } + + const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0]; + const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1]; + + const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] ); + const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? 
b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] ); + + if ( MDRange::inner_direction == MDRange::Right ) { + for (int i0=b0; i0<e0; ++i0) { + #if defined(KOKKOS_MDRANGE_IVDEP) + #pragma ivdep + #endif + for (int i1=b1; i1<e1; ++i1) { + m_func( i0, i1 ); + }} + } else { + for (int i1=b1; i1<e1; ++i1) { + #if defined(KOKKOS_MDRANGE_IVDEP) + #pragma ivdep + #endif + for (int i0=b0; i0<e0; ++i0) { + m_func( i0, i1 ); + }} + } + } + + // Rank-2, Not Flat, Tag + template <typename Idx> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( std::is_integral<Idx>::value + && !std::is_same<void, work_tag>::value + && MDRange::rank == 2 + && MDRange::inner_direction != MDRange::Flat + )>::type + operator()(Idx t) const + { + work_tag tag; + + index_type t0, t1; + if ( MDRange::outer_direction == MDRange::Right ) { + t0 = t / m_range.m_tile_dim[1]; + t1 = t % m_range.m_tile_dim[1]; + } else { + t0 = t % m_range.m_tile_dim[0]; + t1 = t / m_range.m_tile_dim[0]; + } + + const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0]; + const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1]; + + const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] ); + const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? 
b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] ); + + if ( MDRange::inner_direction == MDRange::Right ) { + for (int i0=b0; i0<e0; ++i0) { + #if defined(KOKKOS_MDRANGE_IVDEP) + #pragma ivdep + #endif + for (int i1=b1; i1<e1; ++i1) { + m_func( tag, i0, i1 ); + }} + } else { + for (int i1=b1; i1<e1; ++i1) { + #if defined(KOKKOS_MDRANGE_IVDEP) + #pragma ivdep + #endif + for (int i0=b0; i0<e0; ++i0) { + m_func( tag, i0, i1 ); + }} + } + } + + //--------------------------------------------------------------------------- + + // Rank-3, Flat, No Tag + template <typename Idx> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( std::is_integral<Idx>::value + && std::is_same<void, work_tag>::value + && MDRange::rank == 3 + && MDRange::inner_direction == MDRange::Flat + )>::type + operator()(Idx t) const + { + if ( MDRange::outer_direction == MDRange::Right ) { + const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2]; + m_func( m_range.m_offset[0] + ( t / tmp_prod ) + , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] ) + , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] ) + ); + } else { + const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1]; + m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] ) + , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] ) + , m_range.m_offset[2] + ( t / tmp_prod ) + ); + } + } + + // Rank-3, Flat, Tag + template <typename Idx> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( std::is_integral<Idx>::value + && !std::is_same<void, work_tag>::value + && MDRange::rank == 3 + && MDRange::inner_direction == MDRange::Flat + )>::type + operator()(Idx t) const + { + if ( MDRange::outer_direction == MDRange::Right ) { + const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2]; + m_func( work_tag{} + , m_range.m_offset[0] + ( t / tmp_prod ) + , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] ) + , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] ) + ); 
+ } else { + const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1]; + m_func( work_tag{} + , m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] ) + , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] ) + , m_range.m_offset[2] + ( t / tmp_prod ) + ); + } + } + + // Rank-3, Not Flat, No Tag + template <typename Idx> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( std::is_integral<Idx>::value + && std::is_same<void, work_tag>::value + && MDRange::rank == 3 + && MDRange::inner_direction != MDRange::Flat + )>::type + operator()(Idx t) const + { + index_type t0, t1, t2; + if ( MDRange::outer_direction == MDRange::Right ) { + const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]); + t0 = t / tmp_prod; + t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2]; + t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2]; + } else { + const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]); + t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0]; + t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0]; + t2 = t / tmp_prod; + } + + const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0]; + const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1]; + const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2]; + + const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] ); + const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] ); + const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? 
b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] ); + + if ( MDRange::inner_direction == MDRange::Right ) { + for (int i0=b0; i0<e0; ++i0) { + for (int i1=b1; i1<e1; ++i1) { + #if defined(KOKKOS_MDRANGE_IVDEP) + #pragma ivdep + #endif + for (int i2=b2; i2<e2; ++i2) { + m_func( i0, i1, i2 ); + }}} + } else { + for (int i2=b2; i2<e2; ++i2) { + for (int i1=b1; i1<e1; ++i1) { + #if defined(KOKKOS_MDRANGE_IVDEP) + #pragma ivdep + #endif + for (int i0=b0; i0<e0; ++i0) { + m_func( i0, i1, i2 ); + }}} + } + } + + // Rank-3, Not Flat, Tag + template <typename Idx> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( std::is_integral<Idx>::value + && !std::is_same<void, work_tag>::value + && MDRange::rank == 3 + && MDRange::inner_direction != MDRange::Flat + )>::type + operator()(Idx t) const + { + work_tag tag; + + index_type t0, t1, t2; + if ( MDRange::outer_direction == MDRange::Right ) { + const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]); + t0 = t / tmp_prod; + t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2]; + t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2]; + } else { + const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]); + t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0]; + t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0]; + t2 = t / tmp_prod; + } + + const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0]; + const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1]; + const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2]; + + const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] ); + const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] ); + const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? 
b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] ); + + if ( MDRange::inner_direction == MDRange::Right ) { + for (int i0=b0; i0<e0; ++i0) { + for (int i1=b1; i1<e1; ++i1) { + #if defined(KOKKOS_MDRANGE_IVDEP) + #pragma ivdep + #endif + for (int i2=b2; i2<e2; ++i2) { + m_func( tag, i0, i1, i2 ); + }}} + } else { + for (int i2=b2; i2<e2; ++i2) { + for (int i1=b1; i1<e1; ++i1) { + #if defined(KOKKOS_MDRANGE_IVDEP) + #pragma ivdep + #endif + for (int i0=b0; i0<e0; ++i0) { + m_func( tag, i0, i1, i2 ); + }}} + } + } +}; + + + +} // namespace Impl + + +template <typename MDRange, typename Functor> +void md_parallel_for( MDRange const& range + , Functor const& f + , const std::string& str = "" + ) +{ + Impl::MDForFunctor<MDRange, Functor> g(range, f); + + using range_policy = typename MDRange::range_policy; + + Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str ); +} + +template <typename MDRange, typename Functor> +void md_parallel_for( const std::string& str + , MDRange const& range + , Functor const& f + ) +{ + Impl::MDForFunctor<MDRange, Functor> g(range, f); + + using range_policy = typename MDRange::range_policy; + + Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str ); +} + +}} // namespace Kokkos::Experimental + +#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP + diff --git a/lib/kokkos/core/src/KokkosExp_View.hpp b/lib/kokkos/core/src/KokkosExp_View.hpp deleted file mode 100644 index f62d318f2e..0000000000 --- a/lib/kokkos/core/src/KokkosExp_View.hpp +++ /dev/null @@ -1,2306 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_EXP_VIEW_HPP -#define KOKKOS_EXP_VIEW_HPP - -#include <string> -#include <algorithm> -#include <type_traits> -#include <initializer_list> - -#include <Kokkos_Core_fwd.hpp> -#include <Kokkos_HostSpace.hpp> -#include <Kokkos_MemoryTraits.hpp> -#include <Kokkos_ExecPolicy.hpp> - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -template< class DstMemorySpace , class SrcMemorySpace > -struct DeepCopy ; - -template< class DataType > -struct ViewArrayAnalysis ; - -template< class DataType , class ArrayLayout - , typename ValueType = - typename ViewArrayAnalysis< DataType >::non_const_value_type - > -struct ViewDataAnalysis ; - -template< class , class ... > -class ViewMapping { public: enum { is_assignable = false }; }; - -template< class MemorySpace > -struct ViewOperatorBoundsErrorAbort ; - -template<> -struct ViewOperatorBoundsErrorAbort< Kokkos::HostSpace > { - static void apply( const size_t rank - , const size_t n0 , const size_t n1 - , const size_t n2 , const size_t n3 - , const size_t n4 , const size_t n5 - , const size_t n6 , const size_t n7 - , const size_t i0 , const size_t i1 - , const size_t i2 , const size_t i3 - , const size_t i4 , const size_t i5 - , const size_t i6 , const size_t i7 ); -}; - -} /* namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -/** \class ViewTraits - * \brief Traits class for accessing attributes of a View. - * - * This is an implementation detail of View. 
It is only of interest - * to developers implementing a new specialization of View. - * - * Template argument options: - * - View< DataType > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , ArrayLayout > - * - View< DataType , ArrayLayout , Space > - * - View< DataType , ArrayLayout , MemoryTraits > - * - View< DataType , ArrayLayout , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - */ - -template< class DataType , class ... Properties > -struct ViewTraits ; - -template<> -struct ViewTraits< void > -{ - typedef void execution_space ; - typedef void memory_space ; - typedef void HostMirrorSpace ; - typedef void array_layout ; - typedef void memory_traits ; -}; - -template< class ... Prop > -struct ViewTraits< void , void , Prop ... > -{ - // Ignore an extraneous 'void' - typedef typename ViewTraits<void,Prop...>::execution_space execution_space ; - typedef typename ViewTraits<void,Prop...>::memory_space memory_space ; - typedef typename ViewTraits<void,Prop...>::HostMirrorSpace HostMirrorSpace ; - typedef typename ViewTraits<void,Prop...>::array_layout array_layout ; - typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ; -}; - -template< class ArrayLayout , class ... Prop > -struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_array_layout<ArrayLayout>::value >::type , ArrayLayout , Prop ... > -{ - // Specify layout, keep subsequent space and memory traits arguments - - typedef typename ViewTraits<void,Prop...>::execution_space execution_space ; - typedef typename ViewTraits<void,Prop...>::memory_space memory_space ; - typedef typename ViewTraits<void,Prop...>::HostMirrorSpace HostMirrorSpace ; - typedef ArrayLayout array_layout ; - typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ; -}; - -template< class Space , class ... Prop > -struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_space<Space>::value >::type , Space , Prop ... 
> -{ - // Specify Space, memory traits should be the only subsequent argument. - - static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value || - std::is_same< typename ViewTraits<void,Prop...>::memory_space , void >::value || - std::is_same< typename ViewTraits<void,Prop...>::HostMirrorSpace , void >::value || - std::is_same< typename ViewTraits<void,Prop...>::array_layout , void >::value - , "Only one View Execution or Memory Space template argument" ); - - typedef typename Space::execution_space execution_space ; - typedef typename Space::memory_space memory_space ; - typedef typename Kokkos::Impl::is_space< Space >::host_mirror_space - HostMirrorSpace ; - typedef typename execution_space::array_layout array_layout ; - typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ; -}; - -template< class MemoryTraits , class ... Prop > -struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_memory_traits<MemoryTraits>::value >::type , MemoryTraits , Prop ... > -{ - // Specify memory trait, should not be any subsequent arguments - - static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value || - std::is_same< typename ViewTraits<void,Prop...>::memory_space , void >::value || - std::is_same< typename ViewTraits<void,Prop...>::array_layout , void >::value || - std::is_same< typename ViewTraits<void,Prop...>::memory_traits , void >::value - , "MemoryTrait is the final optional template argument for a View" ); - - typedef void execution_space ; - typedef void memory_space ; - typedef void HostMirrorSpace ; - typedef void array_layout ; - typedef MemoryTraits memory_traits ; -}; - - -template< class DataType , class ... Properties > -struct ViewTraits { -private: - - // Unpack the properties arguments - typedef ViewTraits< void , Properties ... > prop ; - - typedef typename - std::conditional< ! 
std::is_same< typename prop::execution_space , void >::value - , typename prop::execution_space - , Kokkos::DefaultExecutionSpace - >::type - ExecutionSpace ; - - typedef typename - std::conditional< ! std::is_same< typename prop::memory_space , void >::value - , typename prop::memory_space - , typename ExecutionSpace::memory_space - >::type - MemorySpace ; - - typedef typename - std::conditional< ! std::is_same< typename prop::array_layout , void >::value - , typename prop::array_layout - , typename ExecutionSpace::array_layout - >::type - ArrayLayout ; - - typedef typename - std::conditional - < ! std::is_same< typename prop::HostMirrorSpace , void >::value - , typename prop::HostMirrorSpace - , typename Kokkos::Impl::is_space< ExecutionSpace >::host_mirror_space - >::type - HostMirrorSpace ; - - typedef typename - std::conditional< ! std::is_same< typename prop::memory_traits , void >::value - , typename prop::memory_traits - , typename Kokkos::MemoryManaged - >::type - MemoryTraits ; - - // Analyze data type's properties, - // May be specialized based upon the layout and value type - typedef Kokkos::Experimental::Impl::ViewDataAnalysis< DataType , ArrayLayout > data_analysis ; - -public: - - //------------------------------------ - // Data type traits: - - typedef typename data_analysis::type data_type ; - typedef typename data_analysis::const_type const_data_type ; - typedef typename data_analysis::non_const_type non_const_data_type ; - - //------------------------------------ - // Compatible array of trivial type traits: - - typedef typename data_analysis::scalar_array_type scalar_array_type ; - typedef typename data_analysis::const_scalar_array_type const_scalar_array_type ; - typedef typename data_analysis::non_const_scalar_array_type non_const_scalar_array_type ; - - //------------------------------------ - // Value type traits: - - typedef typename data_analysis::value_type value_type ; - typedef typename data_analysis::const_value_type const_value_type ; 
- typedef typename data_analysis::non_const_value_type non_const_value_type ; - - //------------------------------------ - // Mapping traits: - - typedef ArrayLayout array_layout ; - typedef typename data_analysis::dimension dimension ; - typedef typename data_analysis::specialize specialize /* mapping specialization tag */ ; - - enum { rank = dimension::rank }; - enum { rank_dynamic = dimension::rank_dynamic }; - - //------------------------------------ - // Execution space, memory space, memory access traits, and host mirror space. - - typedef ExecutionSpace execution_space ; - typedef MemorySpace memory_space ; - typedef Kokkos::Device<ExecutionSpace,MemorySpace> device_type ; - typedef MemoryTraits memory_traits ; - typedef HostMirrorSpace host_mirror_space ; - - typedef typename MemorySpace::size_type size_type ; - - enum { is_hostspace = std::is_same< MemorySpace , HostSpace >::value }; - enum { is_managed = MemoryTraits::Unmanaged == 0 }; - enum { is_random_access = MemoryTraits::RandomAccess == 1 }; - - //------------------------------------ -}; - -/** \class View - * \brief View to an array of data. - * - * A View represents an array of one or more dimensions. - * For details, please refer to Kokkos' tutorial materials. - * - * \section Kokkos_View_TemplateParameters Template parameters - * - * This class has both required and optional template parameters. The - * \c DataType parameter must always be provided, and must always be - * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are - * placeholders for different template parameters. The default value - * of the fifth template parameter \c Specialize suffices for most use - * cases. When explaining the template parameters, we won't refer to - * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer - * to the valid categories of template parameters, in whatever order - * they may occur. 
- * - * Valid ways in which template arguments may be specified: - * - View< DataType > - * - View< DataType , Layout > - * - View< DataType , Layout , Space > - * - View< DataType , Layout , Space , MemoryTraits > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - * - * \tparam DataType (required) This indicates both the type of each - * entry of the array, and the combination of compile-time and - * run-time array dimension(s). For example, <tt>double*</tt> - * indicates a one-dimensional array of \c double with run-time - * dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int - * with run-time first dimension and compile-time second dimension - * (of 3). In general, the run-time dimensions (if any) must go - * first, followed by zero or more compile-time dimensions. For - * more examples, please refer to the tutorial materials. - * - * \tparam Space (required) The memory space. - * - * \tparam Layout (optional) The array's layout in memory. For - * example, LayoutLeft indicates a column-major (Fortran style) - * layout, and LayoutRight a row-major (C style) layout. If not - * specified, this defaults to the preferred layout for the - * <tt>Space</tt>. - * - * \tparam MemoryTraits (optional) Assertion of the user's intended - * access behavior. For example, RandomAccess indicates read-only - * access with limited spatial locality, and Unmanaged lets users - * wrap externally allocated memory in a View without automatic - * deallocation. - * - * \section Kokkos_View_MT MemoryTraits discussion - * - * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on Space - * - * Some \c MemoryTraits options may have different interpretations for - * different \c Space types. For example, with the Cuda device, - * \c RandomAccess tells Kokkos to fetch the data through the texture - * cache, whereas the non-GPU devices have no such hardware construct. 
- * - * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits - * - * Users should defer applying the optional \c MemoryTraits parameter - * until the point at which they actually plan to rely on it in a - * computational kernel. This minimizes the number of template - * parameters exposed in their code, which reduces the cost of - * compilation. Users may always assign a View without specified - * \c MemoryTraits to a compatible View with that specification. - * For example: - * \code - * // Pass in the simplest types of View possible. - * void - * doSomething (View<double*, Cuda> out, - * View<const double*, Cuda> in) - * { - * // Assign the "generic" View in to a RandomAccess View in_rr. - * // Note that RandomAccess View objects must have const data. - * View<const double*, Cuda, RandomAccess> in_rr = in; - * // ... do something with in_rr and out ... - * } - * \endcode - */ -template< class DataType , class ... Properties > -class View ; - -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include <impl/KokkosExp_ViewMapping.hpp> -#include <impl/KokkosExp_ViewArray.hpp> - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -namespace { - -constexpr Kokkos::Experimental::Impl::ALL_t - ALL = Kokkos::Experimental::Impl::ALL_t(); - -constexpr Kokkos::Experimental::Impl::WithoutInitializing_t - WithoutInitializing = Kokkos::Experimental::Impl::WithoutInitializing_t(); - -constexpr Kokkos::Experimental::Impl::AllowPadding_t - AllowPadding = Kokkos::Experimental::Impl::AllowPadding_t(); - -} - -/** \brief Create View allocation parameter bundle from argument list. 
- * - * Valid argument list members are: - * 1) label as a "string" or std::string - * 2) memory space instance of the View::memory_space type - * 3) execution space instance compatible with the View::memory_space - * 4) Kokkos::WithoutInitializing to bypass initialization - * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory alignment - */ -template< class ... Args > -inline -Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... > -view_alloc( Args const & ... args ) -{ - typedef - Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... > - return_type ; - - static_assert( ! return_type::has_pointer - , "Cannot give pointer-to-memory for view allocation" ); - - return return_type( args... ); -} - -template< class ... Args > -inline -Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... > -view_wrap( Args const & ... args ) -{ - typedef - Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... > - return_type ; - - static_assert( ! return_type::has_memory_space && - ! return_type::has_execution_space && - ! return_type::has_label && - return_type::has_pointer - , "Must only give pointer-to-memory for view wrapping" ); - - return return_type( args... ); -} - -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -template< class DataType , class ... Properties > -class View ; - -template< class > struct is_view : public std::false_type {}; - -template< class D, class ... P > -struct is_view< View<D,P...> > : public std::true_type {}; - -template< class DataType , class ... Properties > -class View : public ViewTraits< DataType , Properties ... > { -private: - - template< class , class ... > friend class View ; - template< class , class ... 
> friend class Impl::ViewMapping ; - -public: - - typedef ViewTraits< DataType , Properties ... > traits ; - -private: - - typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ; - typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ; - - track_type m_track ; - map_type m_map ; - -public: - - //---------------------------------------- - /** \brief Compatible view of array of scalar types */ - typedef View< typename traits::scalar_array_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > - array_type ; - - /** \brief Compatible view of const data type */ - typedef View< typename traits::const_data_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > - const_type ; - - /** \brief Compatible view of non-const data type */ - typedef View< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > - non_const_type ; - - /** \brief Compatible HostMirror view */ - typedef View< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::host_mirror_space > - HostMirror ; - - //---------------------------------------- - // Domain rank and extents - - enum { Rank = map_type::Rank }; - - template< typename iType > - KOKKOS_INLINE_FUNCTION constexpr - typename std::enable_if< std::is_integral<iType>::value , size_t >::type - extent( const iType & r ) const - { return m_map.extent(r); } - - template< typename iType > - KOKKOS_INLINE_FUNCTION constexpr - typename std::enable_if< std::is_integral<iType>::value , int >::type - extent_int( const iType & r ) const - { return static_cast<int>(m_map.extent(r)); } - - KOKKOS_INLINE_FUNCTION constexpr - typename traits::array_layout layout() const - { return m_map.layout(); } - - //---------------------------------------- - /* Deprecate all 'dimension' functions in favor of 
- * ISO/C++ vocabulary 'extent'. - */ - - template< typename iType > - KOKKOS_INLINE_FUNCTION constexpr - typename std::enable_if< std::is_integral<iType>::value , size_t >::type - dimension( const iType & r ) const { return extent( r ); } - - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return m_map.dimension_0() * - m_map.dimension_1() * - m_map.dimension_2() * - m_map.dimension_3() * - m_map.dimension_4() * - m_map.dimension_5() * - m_map.dimension_6() * - m_map.dimension_7(); } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return 
m_map.stride_7(); } - - template< typename iType > - KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_map.stride(s); } - - //---------------------------------------- - // Range span is the span which contains all members. - - typedef typename map_type::reference_type reference_type ; - typedef typename map_type::pointer_type pointer_type ; - - enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - // Deprecated, use 'span()' instead - KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_map.span_is_contiguous(); } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return m_map.data(); } - - // Deprecated, use 'span_is_contigous()' instead - KOKKOS_INLINE_FUNCTION constexpr bool is_contiguous() const { return m_map.span_is_contiguous(); } - // Deprecated, use 'data()' instead - KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const { return m_map.data(); } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_INLINE_FUNCTION - const Kokkos::Experimental::Impl::ViewMapping< traits , void > & - implementation_map() const { return m_map ; } - - //---------------------------------------- - -private: - - enum { - is_layout_left = std::is_same< typename traits::array_layout - , Kokkos::LayoutLeft >::value , - - is_layout_right = std::is_same< typename traits::array_layout - , Kokkos::LayoutRight >::value , - - is_layout_stride = std::is_same< typename traits::array_layout - , Kokkos::LayoutStride >::value , - - is_default_map = - std::is_same< typename traits::specialize , void >::value && - ( is_layout_left || is_layout_right || is_layout_stride ) - }; - -#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) - -#define KOKKOS_VIEW_OPERATOR_VERIFY( 
ARG ) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \ - < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \ - Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ; - -#else - -#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \ - < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); - -#endif - -public: - - //------------------------------ - // Rank 0 operator() - - template< class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if<( Kokkos::Impl::are_integral<Args...>::value - && ( 0 == Rank ) - ), reference_type >::type - operator()( Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,args...) ) - - return m_map.reference(); - } - - //------------------------------ - // Rank 1 operator() - - template< typename I0 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,Args...>::value - && ( 1 == Rank ) - && ! is_default_map - ), reference_type >::type - operator()( const I0 & i0 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) ) - - return m_map.reference(i0); - } - - template< typename I0 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,Args...>::value - && ( 1 == Rank ) - && is_default_map - && ! is_layout_stride - ), reference_type >::type - operator()( const I0 & i0 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) ) - - return m_map.m_handle[ i0 ]; - } - - template< typename I0 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,Args...>::value - && ( 1 == Rank ) - && is_default_map - && is_layout_stride - ), reference_type >::type - operator()( const I0 & i0 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) 
) - - return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ]; - } - - //------------------------------ - // Rank 1 operator[] - - template< typename I0 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0>::value - && ( 1 == Rank ) - && ! is_default_map - ), reference_type >::type - operator[]( const I0 & i0 ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) ) - - return m_map.reference(i0); - } - - template< typename I0 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0>::value - && ( 1 == Rank ) - && is_default_map - && ! is_layout_stride - ), reference_type >::type - operator[]( const I0 & i0 ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) ) - - return m_map.m_handle[ i0 ]; - } - - template< typename I0 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0>::value - && ( 1 == Rank ) - && is_default_map - && is_layout_stride - ), reference_type >::type - operator[]( const I0 & i0 ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) ) - - return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ]; - } - - //------------------------------ - // Rank 2 - - template< typename I0 , typename I1 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,Args...>::value - && ( 2 == Rank ) - && ! is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) - - return m_map.reference(i0,i1); - } - - template< typename I0 , typename I1 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,Args...>::value - && ( 2 == Rank ) - && is_default_map - && is_layout_left && ( traits::rank_dynamic == 0 ) - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 - , Args ... 
args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) - - return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ]; - } - - template< typename I0 , typename I1 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,Args...>::value - && ( 2 == Rank ) - && is_default_map - && is_layout_left && ( traits::rank_dynamic != 0 ) - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) - - return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ]; - } - - template< typename I0 , typename I1 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,Args...>::value - && ( 2 == Rank ) - && is_default_map - && is_layout_right && ( traits::rank_dynamic == 0 ) - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) - - return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ]; - } - - template< typename I0 , typename I1 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,Args...>::value - && ( 2 == Rank ) - && is_default_map - && is_layout_right && ( traits::rank_dynamic != 0 ) - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) - - return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ]; - } - - template< typename I0 , typename I1 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,Args...>::value - && ( 2 == Rank ) - && is_default_map - && is_layout_stride - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) 
) - - return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 + - i1 * m_map.m_offset.m_stride.S1 ]; - } - - //------------------------------ - // Rank 3 - - template< typename I0 , typename I1 , typename I2 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value - && ( 3 == Rank ) - && is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) ) - - return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ]; - } - - template< typename I0 , typename I1 , typename I2 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value - && ( 3 == Rank ) - && ! is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) ) - - return m_map.reference(i0,i1,i2); - } - - //------------------------------ - // Rank 4 - - template< typename I0 , typename I1 , typename I2 , typename I3 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value - && ( 4 == Rank ) - && is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) ) - - return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ]; - } - - template< typename I0 , typename I1 , typename I2 , typename I3 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value - && ( 4 == Rank ) - && ! is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 - , Args ... 
args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) ) - - return m_map.reference(i0,i1,i2,i3); - } - - //------------------------------ - // Rank 5 - - template< typename I0 , typename I1 , typename I2 , typename I3 - , typename I4 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value - && ( 5 == Rank ) - && is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 - , const I4 & i4 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) ) - - return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ]; - } - - template< typename I0 , typename I1 , typename I2 , typename I3 - , typename I4 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value - && ( 5 == Rank ) - && ! is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 - , const I4 & i4 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) ) - - return m_map.reference(i0,i1,i2,i3,i4); - } - - //------------------------------ - // Rank 6 - - template< typename I0 , typename I1 , typename I2 , typename I3 - , typename I4 , typename I5 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value - && ( 6 == Rank ) - && is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 - , const I4 & i4 , const I5 & i5 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) ) - - return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ]; - } - - template< typename I0 , typename I1 , typename I2 , typename I3 - , typename I4 , typename I5 - , class ... 
Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value - && ( 6 == Rank ) - && ! is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 - , const I4 & i4 , const I5 & i5 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) ) - - return m_map.reference(i0,i1,i2,i3,i4,i5); - } - - //------------------------------ - // Rank 7 - - template< typename I0 , typename I1 , typename I2 , typename I3 - , typename I4 , typename I5 , typename I6 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value - && ( 7 == Rank ) - && is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 - , const I4 & i4 , const I5 & i5 , const I6 & i6 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) ) - - return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ]; - } - - template< typename I0 , typename I1 , typename I2 , typename I3 - , typename I4 , typename I5 , typename I6 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value - && ( 7 == Rank ) - && ! is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 - , const I4 & i4 , const I5 & i5 , const I6 & i6 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) ) - - return m_map.reference(i0,i1,i2,i3,i4,i5,i6); - } - - //------------------------------ - // Rank 8 - - template< typename I0 , typename I1 , typename I2 , typename I3 - , typename I4 , typename I5 , typename I6 , typename I7 - , class ... 
Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value - && ( 8 == Rank ) - && is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 - , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) ) - - return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ]; - } - - template< typename I0 , typename I1 , typename I2 , typename I3 - , typename I4 , typename I5 , typename I6 , typename I7 - , class ... Args > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if< - ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value - && ( 8 == Rank ) - && ! is_default_map - ), reference_type >::type - operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 - , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 - , Args ... args ) const - { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) 
) - - return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7); - } - -#undef KOKKOS_VIEW_OPERATOR_VERIFY - - //---------------------------------------- - // Standard destructor, constructors, and assignment operators - - KOKKOS_INLINE_FUNCTION - ~View() {} - - KOKKOS_INLINE_FUNCTION - View() : m_track(), m_map() {} - - KOKKOS_INLINE_FUNCTION - View( const View & rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {} - - KOKKOS_INLINE_FUNCTION - View( View && rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {} - - KOKKOS_INLINE_FUNCTION - View & operator = ( const View & rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; } - - KOKKOS_INLINE_FUNCTION - View & operator = ( View && rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; } - - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. - - template< class RT , class ... RP > - KOKKOS_INLINE_FUNCTION - View( const View<RT,RP...> & rhs ) - : m_track( rhs.m_track , traits::is_managed ) - , m_map() - { - typedef typename View<RT,RP...>::traits SrcTraits ; - typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; - static_assert( Mapping::is_assignable , "Incompatible View copy construction" ); - Mapping::assign( m_map , rhs.m_map , rhs.m_track ); - } - - template< class RT , class ... RP > - KOKKOS_INLINE_FUNCTION - View & operator = ( const View<RT,RP...> & rhs ) - { - typedef typename View<RT,RP...>::traits SrcTraits ; - typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; - static_assert( Mapping::is_assignable , "Incompatible View copy assignment" ); - Mapping::assign( m_map , rhs.m_map , rhs.m_track ); - m_track.assign( rhs.m_track , traits::is_managed ); - return *this ; - } - - //---------------------------------------- - // Compatible subview constructor - // may assign unmanaged from managed. - - template< class RT , class ... 
RP , class Arg0 , class ... Args > - KOKKOS_INLINE_FUNCTION - View( const View< RT , RP... > & src_view - , const Arg0 & arg0 , Args ... args ) - : m_track( src_view.m_track , traits::is_managed ) - , m_map() - { - typedef View< RT , RP... > SrcType ; - - typedef Kokkos::Experimental::Impl::ViewMapping - < void /* deduce destination view type from source view traits */ - , typename SrcType::traits - , Arg0 , Args... > Mapping ; - - typedef typename Mapping::type DstType ; - - static_assert( Kokkos::Experimental::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable - , "Subview construction requires compatible view and subview arguments" ); - - Mapping::assign( m_map, src_view.m_map, arg0 , args... ); - } - - //---------------------------------------- - // Allocation tracking properties - - KOKKOS_INLINE_FUNCTION - int use_count() const - { return m_track.use_count(); } - - inline - const std::string label() const - { return m_track.template get_label< typename traits::memory_space >(); } - - //---------------------------------------- - // Allocation according to allocation properties and array layout - - template< class ... P > - explicit inline - View( const Impl::ViewCtorProp< P ... > & arg_prop - , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer - , typename traits::array_layout - >::type const & arg_layout - ) - : m_track() - , m_map() - { - // Append layout and spaces if not input - typedef Impl::ViewCtorProp< P ... > alloc_prop_input ; - - // use 'std::integral_constant<unsigned,I>' for non-types - // to avoid duplicate class error. - typedef Impl::ViewCtorProp - < P ... 
- , typename std::conditional - < alloc_prop_input::has_label - , std::integral_constant<unsigned,0> - , typename std::string - >::type - , typename std::conditional - < alloc_prop_input::has_memory_space - , std::integral_constant<unsigned,1> - , typename traits::device_type::memory_space - >::type - , typename std::conditional - < alloc_prop_input::has_execution_space - , std::integral_constant<unsigned,2> - , typename traits::device_type::execution_space - >::type - > alloc_prop ; - - static_assert( traits::is_managed - , "View allocation constructor requires managed memory" ); - - if ( alloc_prop::initialize && - ! alloc_prop::execution_space::is_initialized() ) { - // If initializing view data then - // the execution space must be initialized. - Kokkos::Impl::throw_runtime_exception("Constructing View and initializing data with uninitialized execution space"); - } - - // Copy the input allocation properties with possibly defaulted properties - alloc_prop prop( arg_prop ); - -//------------------------------------------------------------ -#if defined( KOKKOS_HAVE_CUDA ) - // If allocating in CudaUVMSpace must fence before and after - // the allocation to protect against possible concurrent access - // on the CPU and the GPU. - // Fence using the trait's executon space (which will be Kokkos::Cuda) - // to avoid incomplete type errors from usng Kokkos::Cuda directly. 
- if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) { - traits::device_type::memory_space::execution_space::fence(); - } -#endif -//------------------------------------------------------------ - - Kokkos::Experimental::Impl::SharedAllocationRecord<> * - record = m_map.allocate_shared( prop , arg_layout ); - -//------------------------------------------------------------ -#if defined( KOKKOS_HAVE_CUDA ) - if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) { - traits::device_type::memory_space::execution_space::fence(); - } -#endif -//------------------------------------------------------------ - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized( record ); - } - - // Wrap memory according to properties and array layout - template< class ... P > - explicit KOKKOS_INLINE_FUNCTION - View( const Impl::ViewCtorProp< P ... > & arg_prop - , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer - , typename traits::array_layout - >::type const & arg_layout - ) - : m_track() // No memory tracking - , m_map( arg_prop , arg_layout ) - { - static_assert( - std::is_same< pointer_type - , typename Impl::ViewCtorProp< P... >::pointer_type - >::value , - "Constructing View to wrap user memory must supply matching pointer type" ); - } - - // Simple dimension-only layout - template< class ... P > - explicit inline - View( const Impl::ViewCtorProp< P ... > & arg_prop - , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer - , size_t - >::type const arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 - ) - : View( arg_prop - , typename traits::array_layout - ( arg_N0 , arg_N1 , arg_N2 , arg_N3 - , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) - ) - {} - - template< class ... 
P > - explicit KOKKOS_INLINE_FUNCTION - View( const Impl::ViewCtorProp< P ... > & arg_prop - , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer - , size_t - >::type const arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 - ) - : View( arg_prop - , typename traits::array_layout - ( arg_N0 , arg_N1 , arg_N2 , arg_N3 - , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) - ) - {} - - // Allocate with label and layout - template< typename Label > - explicit inline - View( const Label & arg_label - , typename std::enable_if< - Kokkos::Experimental::Impl::is_view_label<Label>::value , - typename traits::array_layout >::type const & arg_layout - ) - : View( Impl::ViewCtorProp< std::string >( arg_label ) , arg_layout ) - {} - - // Allocate label and layout, must disambiguate from subview constructor. - template< typename Label > - explicit inline - View( const Label & arg_label - , typename std::enable_if< - Kokkos::Experimental::Impl::is_view_label<Label>::value , - const size_t >::type arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 - ) - : View( Impl::ViewCtorProp< std::string >( arg_label ) - , typename traits::array_layout - ( arg_N0 , arg_N1 , arg_N2 , arg_N3 - , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) - ) - {} - - // For backward compatibility - explicit inline - View( const ViewAllocateWithoutInitializing & arg_prop - , const typename traits::array_layout & arg_layout - ) - : View( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing ) - , arg_layout - ) - {} - - explicit inline - View( const ViewAllocateWithoutInitializing & arg_prop - , const size_t arg_N0 = 0 - , 
const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 - ) - : View( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing ) - , typename traits::array_layout - ( arg_N0 , arg_N1 , arg_N2 , arg_N3 - , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) - ) - {} - - //---------------------------------------- - // Memory span required to wrap these dimensions. - static constexpr size_t memory_span( const size_t arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 - ) - { - return map_type::memory_span( - typename traits::array_layout - ( arg_N0 , arg_N1 , arg_N2 , arg_N3 - , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ); - } - - explicit KOKKOS_INLINE_FUNCTION - View( pointer_type arg_ptr - , const size_t arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 - ) - : View( Impl::ViewCtorProp<pointer_type>(arg_ptr) - , typename traits::array_layout - ( arg_N0 , arg_N1 , arg_N2 , arg_N3 - , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) - ) - {} - - explicit KOKKOS_INLINE_FUNCTION - View( pointer_type arg_ptr - , const typename traits::array_layout & arg_layout - ) - : View( Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_layout ) - {} - - //---------------------------------------- - // Shared scratch memory constructor - - static inline - size_t shmem_size( const size_t arg_N0 = 0 , - const size_t arg_N1 = 0 , - const size_t arg_N2 = 0 , - const size_t arg_N3 = 0 , - const size_t arg_N4 = 0 , - const size_t arg_N5 = 0 , - const size_t arg_N6 = 0 , - const size_t arg_N7 = 0 ) - 
{ - return map_type::memory_span( - typename traits::array_layout - ( arg_N0 , arg_N1 , arg_N2 , arg_N3 - , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ); - } - - explicit KOKKOS_INLINE_FUNCTION - View( const typename traits::execution_space::scratch_memory_space & arg_space - , const typename traits::array_layout & arg_layout ) - : View( Impl::ViewCtorProp<pointer_type>( - reinterpret_cast<pointer_type>( - arg_space.get_shmem( map_type::memory_span( arg_layout ) ) ) ) - , arg_layout ) - {} - - explicit KOKKOS_INLINE_FUNCTION - View( const typename traits::execution_space::scratch_memory_space & arg_space - , const size_t arg_N0 = 0 - , const size_t arg_N1 = 0 - , const size_t arg_N2 = 0 - , const size_t arg_N3 = 0 - , const size_t arg_N4 = 0 - , const size_t arg_N5 = 0 - , const size_t arg_N6 = 0 - , const size_t arg_N7 = 0 ) - : View( Impl::ViewCtorProp<pointer_type>( - reinterpret_cast<pointer_type>( - arg_space.get_shmem( - map_type::memory_span( - typename traits::array_layout - ( arg_N0 , arg_N1 , arg_N2 , arg_N3 - , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) ) - , typename traits::array_layout - ( arg_N0 , arg_N1 , arg_N2 , arg_N3 - , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) - ) - {} -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template< class V , class ... Args > -using Subview = - typename Kokkos::Experimental::Impl::ViewMapping - < void /* deduce subview type from source view traits */ - , typename V::traits - , Args ... - >::type ; - -template< class D, class ... P , class ... Args > -KOKKOS_INLINE_FUNCTION -typename Kokkos::Experimental::Impl::ViewMapping - < void /* deduce subview type from source view traits */ - , ViewTraits< D , P... > - , Args ... - >::type -subview( const View< D, P... > & src , Args ... args ) -{ - static_assert( View< D , P... 
>::Rank == sizeof...(Args) , - "subview requires one argument for each source View rank" ); - - return typename - Kokkos::Experimental::Impl::ViewMapping - < void /* deduce subview type from source view traits */ - , ViewTraits< D , P ... > - , Args ... >::type( src , args ... ); -} - -template< class MemoryTraits , class D, class ... P , class ... Args > -KOKKOS_INLINE_FUNCTION -typename Kokkos::Experimental::Impl::ViewMapping - < void /* deduce subview type from source view traits */ - , ViewTraits< D , P... > - , Args ... - >::template apply< MemoryTraits >::type -subview( const View< D, P... > & src , Args ... args ) -{ - static_assert( View< D , P... >::Rank == sizeof...(Args) , - "subview requires one argument for each source View rank" ); - - return typename - Kokkos::Experimental::Impl::ViewMapping - < void /* deduce subview type from source view traits */ - , ViewTraits< D , P ... > - , Args ... > - ::template apply< MemoryTraits > - ::type( src , args ... ); -} - - - -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -template< class LT , class ... LP , class RT , class ... 
RP > -KOKKOS_INLINE_FUNCTION -bool operator == ( const View<LT,LP...> & lhs , - const View<RT,RP...> & rhs ) -{ - // Same data, layout, dimensions - typedef ViewTraits<LT,LP...> lhs_traits ; - typedef ViewTraits<RT,RP...> rhs_traits ; - - return - std::is_same< typename lhs_traits::const_value_type , - typename rhs_traits::const_value_type >::value && - std::is_same< typename lhs_traits::array_layout , - typename rhs_traits::array_layout >::value && - std::is_same< typename lhs_traits::memory_space , - typename rhs_traits::memory_space >::value && - unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && - lhs.data() == rhs.data() && - lhs.span() == rhs.span() && - lhs.dimension_0() == rhs.dimension_0() && - lhs.dimension_1() == rhs.dimension_1() && - lhs.dimension_2() == rhs.dimension_2() && - lhs.dimension_3() == rhs.dimension_3() && - lhs.dimension_4() == rhs.dimension_4() && - lhs.dimension_5() == rhs.dimension_5() && - lhs.dimension_6() == rhs.dimension_6() && - lhs.dimension_7() == rhs.dimension_7(); -} - -template< class LT , class ... LP , class RT , class ... RP > -KOKKOS_INLINE_FUNCTION -bool operator != ( const View<LT,LP...> & lhs , - const View<RT,RP...> & rhs ) -{ - return ! 
( operator==(lhs,rhs) ); -} - -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#if KOKKOS_USING_EXP_VIEW - -inline -void shared_allocation_tracking_claim_and_disable() -{ Kokkos::Experimental::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); } - -inline -void shared_allocation_tracking_release_and_enable() -{ Kokkos::Experimental::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); } - -#else - -inline -void shared_allocation_tracking_claim_and_disable() -{ Kokkos::Impl::AllocationTracker::disable_tracking(); } - -inline -void shared_allocation_tracking_release_and_enable() -{ Kokkos::Impl::AllocationTracker::enable_tracking(); } - -#endif - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -template< class OutputView , typename Enable = void > -struct ViewFill { - - typedef typename OutputView::const_value_type const_value_type ; - - const OutputView output ; - const_value_type input ; - - KOKKOS_INLINE_FUNCTION - void operator()( const size_t i0 ) const - { - const size_t n1 = output.dimension_1(); - const size_t n2 = output.dimension_2(); - const size_t n3 = output.dimension_3(); - const size_t n4 = output.dimension_4(); - const size_t n5 = output.dimension_5(); - const size_t n6 = output.dimension_6(); - const size_t n7 = output.dimension_7(); - - for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { - for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { - for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) { - for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) { - for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) { - for ( size_t i6 = 0 ; i6 < 
n6 ; ++i6 ) { - for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) { - output(i0,i1,i2,i3,i4,i5,i6,i7) = input ; - }}}}}}} - } - - ViewFill( const OutputView & arg_out , const_value_type & arg_in ) - : output( arg_out ), input( arg_in ) - { - typedef typename OutputView::execution_space execution_space ; - typedef Kokkos::RangePolicy< execution_space > Policy ; - - const Kokkos::Impl::ParallelFor< ViewFill , Policy > closure( *this , Policy( 0 , output.dimension_0() ) ); - - closure.execute(); - - execution_space::fence(); - } -}; - -template< class OutputView > -struct ViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > { - ViewFill( const OutputView & dst , const typename OutputView::const_value_type & src ) - { - Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace > - ( dst.data() , & src , sizeof(typename OutputView::const_value_type) ); - } -}; - -template< class OutputView , class InputView > -struct ViewRemap { - - const OutputView output ; - const InputView input ; - const size_t n0 ; - const size_t n1 ; - const size_t n2 ; - const size_t n3 ; - const size_t n4 ; - const size_t n5 ; - const size_t n6 ; - const size_t n7 ; - - ViewRemap( const OutputView & arg_out , const InputView & arg_in ) - : output( arg_out ), input( arg_in ) - , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) ) - , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) ) - , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) ) - , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) ) - , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) ) - , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) ) - , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) ) - , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) ) - { - typedef typename 
OutputView::execution_space execution_space ; - typedef Kokkos::RangePolicy< execution_space > Policy ; - const Kokkos::Impl::ParallelFor< ViewRemap , Policy > closure( *this , Policy( 0 , n0 ) ); - closure.execute(); - } - - KOKKOS_INLINE_FUNCTION - void operator()( const size_t i0 ) const - { - for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { - for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { - for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) { - for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) { - for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) { - for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) { - for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) { - output(i0,i1,i2,i3,i4,i5,i6,i7) = input(i0,i1,i2,i3,i4,i5,i6,i7); - }}}}}}} - } -}; - -} /* namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -/** \brief Deep copy a value from Host memory into a view. */ -template< class DT , class ... DP > -inline -void deep_copy - ( const View<DT,DP...> & dst - , typename ViewTraits<DT,DP...>::const_value_type & value - , typename std::enable_if< - std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value - >::type * = 0 ) -{ - static_assert( - std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type , - typename ViewTraits<DT,DP...>::value_type >::value - , "deep_copy requires non-const type" ); - - Kokkos::Experimental::Impl::ViewFill< View<DT,DP...> >( dst , value ); -} - -/** \brief Deep copy into a value in Host memory from a view. */ -template< class ST , class ... 
SP > -inline -void deep_copy - ( typename ViewTraits<ST,SP...>::non_const_value_type & dst - , const View<ST,SP...> & src - , typename std::enable_if< - std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value - >::type * = 0 ) -{ - static_assert( ViewTraits<ST,SP...>::rank == 0 - , "ERROR: Non-rank-zero view in deep_copy( value , View )" ); - - typedef ViewTraits<ST,SP...> src_traits ; - typedef typename src_traits::memory_space src_memory_space ; - Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) ); -} - -//---------------------------------------------------------------------------- -/** \brief A deep copy between views of compatible type, and rank zero. */ -template< class DT , class ... DP , class ST , class ... SP > -inline -void deep_copy - ( const View<DT,DP...> & dst - , const View<ST,SP...> & src - , typename std::enable_if<( - std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value && - std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value && - ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) && - unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) ) - )>::type * = 0 ) -{ - static_assert( - std::is_same< typename ViewTraits<DT,DP...>::value_type , - typename ViewTraits<ST,SP...>::non_const_value_type >::value - , "deep_copy requires matching non-const destination type" ); - - typedef View<DT,DP...> dst_type ; - typedef View<ST,SP...> src_type ; - - typedef typename dst_type::value_type value_type ; - typedef typename dst_type::memory_space dst_memory_space ; - typedef typename src_type::memory_space src_memory_space ; - - if ( dst.data() != src.data() ) { - Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) ); - } -} - -//---------------------------------------------------------------------------- -/** \brief A deep copy between views of the default specialization, compatible type, - * same non-zero rank, 
same contiguous layout. - */ -template< class DT , class ... DP , class ST , class ... SP > -inline -void deep_copy - ( const View<DT,DP...> & dst - , const View<ST,SP...> & src - , typename std::enable_if<( - std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value && - std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value && - ( unsigned(ViewTraits<DT,DP...>::rank) != 0 || - unsigned(ViewTraits<ST,SP...>::rank) != 0 ) - )>::type * = 0 ) -{ - static_assert( - std::is_same< typename ViewTraits<DT,DP...>::value_type , - typename ViewTraits<DT,DP...>::non_const_value_type >::value - , "deep_copy requires non-const destination type" ); - - static_assert( - ( unsigned(ViewTraits<DT,DP...>::rank) == - unsigned(ViewTraits<ST,SP...>::rank) ) - , "deep_copy requires Views of equal rank" ); - - typedef View<DT,DP...> dst_type ; - typedef View<ST,SP...> src_type ; - - typedef typename dst_type::execution_space dst_execution_space ; - typedef typename dst_type::memory_space dst_memory_space ; - typedef typename src_type::memory_space src_memory_space ; - - enum { DstExecCanAccessSrc = - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value }; - - if ( (void *) dst.data() != (void*) src.data() ) { - - // Concern: If overlapping views then a parallel copy will be erroneous. - // ... 
- - // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy - - if ( std::is_same< typename ViewTraits<DT,DP...>::value_type , - typename ViewTraits<ST,SP...>::non_const_value_type >::value && - ( - std::is_same< typename ViewTraits<DT,DP...>::array_layout , - typename ViewTraits<ST,SP...>::array_layout >::value - || - ( ViewTraits<DT,DP...>::rank == 1 && - ViewTraits<ST,SP...>::rank == 1 ) - ) && - dst.span_is_contiguous() && - src.span_is_contiguous() && - dst.span() == src.span() && - dst.dimension_0() == src.dimension_0() && - dst.dimension_1() == src.dimension_1() && - dst.dimension_2() == src.dimension_2() && - dst.dimension_3() == src.dimension_3() && - dst.dimension_4() == src.dimension_4() && - dst.dimension_5() == src.dimension_5() && - dst.dimension_6() == src.dimension_6() && - dst.dimension_7() == src.dimension_7() ) { - - const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); - - Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes ); - } - else if ( DstExecCanAccessSrc ) { - // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. - Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src ); - } - else { - Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); - } - } -} - -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -/** \brief Deep copy a value from Host memory into a view. */ -template< class ExecSpace ,class DT , class ... 
DP > -inline -void deep_copy - ( const ExecSpace & - , const View<DT,DP...> & dst - , typename ViewTraits<DT,DP...>::const_value_type & value - , typename std::enable_if< - Kokkos::Impl::is_execution_space< ExecSpace >::value && - std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value - >::type * = 0 ) -{ - static_assert( - std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type , - typename ViewTraits<DT,DP...>::value_type >::value - , "deep_copy requires non-const type" ); - - Kokkos::Experimental::Impl::ViewFill< View<DT,DP...> >( dst , value ); -} - -/** \brief Deep copy into a value in Host memory from a view. */ -template< class ExecSpace , class ST , class ... SP > -inline -void deep_copy - ( const ExecSpace & exec_space - , typename ViewTraits<ST,SP...>::non_const_value_type & dst - , const View<ST,SP...> & src - , typename std::enable_if< - Kokkos::Impl::is_execution_space< ExecSpace >::value && - std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value - >::type * = 0 ) -{ - static_assert( ViewTraits<ST,SP...>::rank == 0 - , "ERROR: Non-rank-zero view in deep_copy( value , View )" ); - - typedef ViewTraits<ST,SP...> src_traits ; - typedef typename src_traits::memory_space src_memory_space ; - Kokkos::Impl::DeepCopy< HostSpace , src_memory_space , ExecSpace > - ( exec_space , & dst , src.data() , sizeof(ST) ); -} - -//---------------------------------------------------------------------------- -/** \brief A deep copy between views of compatible type, and rank zero. */ -template< class ExecSpace , class DT , class ... DP , class ST , class ... 
SP > -inline -void deep_copy - ( const ExecSpace & exec_space - , const View<DT,DP...> & dst - , const View<ST,SP...> & src - , typename std::enable_if<( - Kokkos::Impl::is_execution_space< ExecSpace >::value && - std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value && - std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value && - ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) && - unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) ) - )>::type * = 0 ) -{ - static_assert( - std::is_same< typename ViewTraits<DT,DP...>::value_type , - typename ViewTraits<ST,SP...>::non_const_value_type >::value - , "deep_copy requires matching non-const destination type" ); - - typedef View<DT,DP...> dst_type ; - typedef View<ST,SP...> src_type ; - - typedef typename dst_type::value_type value_type ; - typedef typename dst_type::memory_space dst_memory_space ; - typedef typename src_type::memory_space src_memory_space ; - - if ( dst.data() != src.data() ) { - Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace > - ( exec_space , dst.data() , src.data() , sizeof(value_type) ); - } -} - -//---------------------------------------------------------------------------- -/** \brief A deep copy between views of the default specialization, compatible type, - * same non-zero rank, same contiguous layout. - */ -template< class ExecSpace , class DT, class ... DP, class ST, class ... 
SP > -inline -void deep_copy - ( const ExecSpace & exec_space - , const View<DT,DP...> & dst - , const View<ST,SP...> & src - , typename std::enable_if<( - Kokkos::Impl::is_execution_space< ExecSpace >::value && - std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value && - std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value && - ( unsigned(ViewTraits<DT,DP...>::rank) != 0 || - unsigned(ViewTraits<ST,SP...>::rank) != 0 ) - )>::type * = 0 ) -{ - static_assert( - std::is_same< typename ViewTraits<DT,DP...>::value_type , - typename ViewTraits<DT,DP...>::non_const_value_type >::value - , "deep_copy requires non-const destination type" ); - - static_assert( - ( unsigned(ViewTraits<DT,DP...>::rank) == - unsigned(ViewTraits<ST,SP...>::rank) ) - , "deep_copy requires Views of equal rank" ); - - typedef View<DT,DP...> dst_type ; - typedef View<ST,SP...> src_type ; - - typedef typename dst_type::execution_space dst_execution_space ; - typedef typename dst_type::memory_space dst_memory_space ; - typedef typename src_type::memory_space src_memory_space ; - - enum { DstExecCanAccessSrc = - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value }; - - if ( (void *) dst.data() != (void*) src.data() ) { - - // Concern: If overlapping views then a parallel copy will be erroneous. - // ... 
- - // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy - - if ( std::is_same< typename ViewTraits<DT,DP...>::value_type , - typename ViewTraits<ST,SP...>::non_const_value_type >::value && - ( - std::is_same< typename ViewTraits<DT,DP...>::array_layout , - typename ViewTraits<ST,SP...>::array_layout >::value - || - ( ViewTraits<DT,DP...>::rank == 1 && - ViewTraits<ST,SP...>::rank == 1 ) - ) && - dst.span_is_contiguous() && - src.span_is_contiguous() && - dst.span() == src.span() && - dst.dimension_0() == src.dimension_0() && - dst.dimension_1() == src.dimension_1() && - dst.dimension_2() == src.dimension_2() && - dst.dimension_3() == src.dimension_3() && - dst.dimension_4() == src.dimension_4() && - dst.dimension_5() == src.dimension_5() && - dst.dimension_6() == src.dimension_6() && - dst.dimension_7() == src.dimension_7() ) { - - const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); - - Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace > - ( exec_space , dst.data() , src.data() , nbytes ); - } - else if ( DstExecCanAccessSrc ) { - // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. - Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src ); - } - else { - Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); - } - } -} - -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -// Deduce Mirror Types -template<class Space, class T, class ... 
P> -struct MirrorViewType { - // The incoming view_type - typedef typename Kokkos::Experimental::View<T,P...> src_view_type; - // The memory space for the mirror view - typedef typename Space::memory_space memory_space; - // Check whether it is the same memory space - enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value }; - // The array_layout - typedef typename src_view_type::array_layout array_layout; - // The data type (we probably want it non-const since otherwise we can't even deep_copy to it. - typedef typename src_view_type::non_const_data_type data_type; - // The destination view type if it is not the same memory space - typedef Kokkos::Experimental::View<data_type,array_layout,Space> dest_view_type; - // If it is the same memory_space return the existsing view_type - // This will also keep the unmanaged trait if necessary - typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type; -}; - -template<class Space, class T, class ... P> -struct MirrorType { - // The incoming view_type - typedef typename Kokkos::Experimental::View<T,P...> src_view_type; - // The memory space for the mirror view - typedef typename Space::memory_space memory_space; - // Check whether it is the same memory space - enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value }; - // The array_layout - typedef typename src_view_type::array_layout array_layout; - // The data type (we probably want it non-const since otherwise we can't even deep_copy to it. - typedef typename src_view_type::non_const_data_type data_type; - // The destination view type if it is not the same memory space - typedef Kokkos::Experimental::View<data_type,array_layout,Space> view_type; -}; - -} - -template< class T , class ... P > -inline -typename Kokkos::Experimental::View<T,P...>::HostMirror -create_mirror( const Kokkos::Experimental::View<T,P...> & src - , typename std::enable_if< - ! 
std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout - , Kokkos::LayoutStride >::value - >::type * = 0 - ) -{ - typedef View<T,P...> src_type ; - typedef typename src_type::HostMirror dst_type ; - - return dst_type( std::string( src.label() ).append("_mirror") - , src.dimension_0() - , src.dimension_1() - , src.dimension_2() - , src.dimension_3() - , src.dimension_4() - , src.dimension_5() - , src.dimension_6() - , src.dimension_7() ); -} - -template< class T , class ... P > -inline -typename Kokkos::Experimental::View<T,P...>::HostMirror -create_mirror( const Kokkos::Experimental::View<T,P...> & src - , typename std::enable_if< - std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout - , Kokkos::LayoutStride >::value - >::type * = 0 - ) -{ - typedef View<T,P...> src_type ; - typedef typename src_type::HostMirror dst_type ; - - Kokkos::LayoutStride layout ; - - layout.dimension[0] = src.dimension_0(); - layout.dimension[1] = src.dimension_1(); - layout.dimension[2] = src.dimension_2(); - layout.dimension[3] = src.dimension_3(); - layout.dimension[4] = src.dimension_4(); - layout.dimension[5] = src.dimension_5(); - layout.dimension[6] = src.dimension_6(); - layout.dimension[7] = src.dimension_7(); - - layout.stride[0] = src.stride_0(); - layout.stride[1] = src.stride_1(); - layout.stride[2] = src.stride_2(); - layout.stride[3] = src.stride_3(); - layout.stride[4] = src.stride_4(); - layout.stride[5] = src.stride_5(); - layout.stride[6] = src.stride_6(); - layout.stride[7] = src.stride_7(); - - return dst_type( std::string( src.label() ).append("_mirror") , layout ); -} - - -// Create a mirror in a new space (specialization for different space) -template<class Space, class T, class ... 
P> -typename Impl::MirrorType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::Experimental::View<T,P...> & src) { - return typename Impl::MirrorType<Space,T,P ...>::view_type(src.label(),src.layout()); -} - -template< class T , class ... P > -inline -typename Kokkos::Experimental::View<T,P...>::HostMirror -create_mirror_view( const Kokkos::Experimental::View<T,P...> & src - , typename std::enable_if<( - std::is_same< typename Kokkos::Experimental::View<T,P...>::memory_space - , typename Kokkos::Experimental::View<T,P...>::HostMirror::memory_space - >::value - && - std::is_same< typename Kokkos::Experimental::View<T,P...>::data_type - , typename Kokkos::Experimental::View<T,P...>::HostMirror::data_type - >::value - )>::type * = 0 - ) -{ - return src ; -} - -template< class T , class ... P > -inline -typename Kokkos::Experimental::View<T,P...>::HostMirror -create_mirror_view( const Kokkos::Experimental::View<T,P...> & src - , typename std::enable_if< ! ( - std::is_same< typename Kokkos::Experimental::View<T,P...>::memory_space - , typename Kokkos::Experimental::View<T,P...>::HostMirror::memory_space - >::value - && - std::is_same< typename Kokkos::Experimental::View<T,P...>::data_type - , typename Kokkos::Experimental::View<T,P...>::HostMirror::data_type - >::value - )>::type * = 0 - ) -{ - return Kokkos::Experimental::create_mirror( src ); -} - -// Create a mirror view in a new space (specialization for same space) -template<class Space, class T, class ... P> -typename Impl::MirrorViewType<Space,T,P ...>::view_type -create_mirror_view(const Space& , const Kokkos::Experimental::View<T,P...> & src - , typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) { - return src; -} - -// Create a mirror view in a new space (specialization for different space) -template<class Space, class T, class ... 
P> -typename Impl::MirrorViewType<Space,T,P ...>::view_type -create_mirror_view(const Space& , const Kokkos::Experimental::View<T,P...> & src - , typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) { - return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout()); -} - -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -/** \brief Resize a view with copying old data to new data at the corresponding indices. */ -template< class T , class ... P > -inline -void resize( Kokkos::Experimental::View<T,P...> & v , - const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) -{ - typedef Kokkos::Experimental::View<T,P...> view_type ; - - static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only resize managed views" ); - - view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 ); - - Kokkos::Experimental::Impl::ViewRemap< view_type , view_type >( v_resized , v ); - - v = v_resized ; -} - -/** \brief Resize a view with copying old data to new data at the corresponding indices. */ -template< class T , class ... 
P > -inline -void realloc( Kokkos::Experimental::View<T,P...> & v , - const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) -{ - typedef Kokkos::Experimental::View<T,P...> view_type ; - - static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" ); - - const std::string label = v.label(); - - v = view_type(); // Deallocate first, if the only view to allocation - v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 ); -} - -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if KOKKOS_USING_EXP_VIEW - -namespace Kokkos { - -template< class D , class ... P > -using ViewTraits = Kokkos::Experimental::ViewTraits<D,P...> ; - -template< class D , class ... P > -using View = Kokkos::Experimental::View<D,P...> ; - -using Kokkos::Experimental::ALL ; -using Kokkos::Experimental::deep_copy ; -using Kokkos::Experimental::create_mirror ; -using Kokkos::Experimental::create_mirror_view ; -using Kokkos::Experimental::subview ; -using Kokkos::Experimental::resize ; -using Kokkos::Experimental::realloc ; -using Kokkos::Experimental::is_view ; - -namespace Impl { - -using Kokkos::Experimental::is_view ; - -class ViewDefault {}; - -template< class SrcViewType - , class Arg0Type - , class Arg1Type - , class Arg2Type - , class Arg3Type - , class Arg4Type - , class Arg5Type - , class Arg6Type - , class Arg7Type - > -struct ViewSubview /* { typedef ... 
type ; } */ ; - -} - -} /* namespace Kokkos */ - -#include <impl/Kokkos_Atomic_View.hpp> - -#endif /* #if KOKKOS_USING_EXP_VIEW */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif - diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 11aaf96177..cdfa4429f0 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -121,13 +121,22 @@ public: return *this; } - //! Assignment operator. + /// \brief Assignment operator, for volatile <tt>*this</tt> and + /// nonvolatile input. + /// + /// \param src [in] Input; right-hand side of the assignment. + /// + /// This operator returns \c void instead of <tt>volatile + /// complex<RealType>& </tt>. See Kokkos Issue #177 for the + /// explanation. In practice, this means that you should not chain + /// assignments with volatile lvalues. template<class InputRealType> KOKKOS_INLINE_FUNCTION - volatile complex<RealType>& operator= (const complex<InputRealType>& src) volatile { + void operator= (const complex<InputRealType>& src) volatile { re_ = src.re_; im_ = src.im_; - return *this; + // We deliberately do not return anything here. See explanation + // in public documentation above. } //! Assignment operator. diff --git a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp similarity index 56% rename from lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp rename to lib/kokkos/core/src/Kokkos_Concepts.hpp index ad3e0b35a5..82a342eec0 100644 --- a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp +++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 
2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,86 +36,43 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ -#ifndef KOKKOS_BASIC_ALLOCATORS_HPP -#define KOKKOS_BASIC_ALLOCATORS_HPP - -#if ! KOKKOS_USING_EXP_VIEW - -namespace Kokkos { namespace Impl { - -/// class UnmanagedAllocator -/// does nothing when deallocate(ptr,size) is called -class UnmanagedAllocator -{ -public: - static const char * name() { return "Unmanaged Allocator"; } - - static void deallocate(void * /*ptr*/, size_t /*size*/) {} -}; - - -/// class MallocAllocator -class MallocAllocator -{ -public: - static const char * name() - { - return "Malloc Allocator"; - } - - static void* allocate(size_t size); - - static void deallocate(void * ptr, size_t size); +#ifndef KOKKOS_CORE_CONCEPTS_HPP +#define KOKKOS_CORE_CONCEPTS_HPP - static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); -}; +#include <type_traits> +namespace Kokkos { +//Schedules for Execution Policies +struct Static {}; +struct Dynamic {}; -/// class AlignedAllocator -/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT -class AlignedAllocator +//Schedule Wrapper Type +template<class T> +struct Schedule { -public: - static const char * name() - { - return "Aligned Allocator"; - } - - static void* allocate(size_t size); - - static void deallocate(void * ptr, size_t size); - - static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); + static_assert( std::is_same<T,Static>::value + || std::is_same<T,Dynamic>::value + , "Kokkos: 
Invalid Schedule<> type." + ); + using schedule_type = Schedule<T>; + using type = T; }; - -/// class PageAlignedAllocator -/// memory aligned to PAGE_SIZE -class PageAlignedAllocator +//Specify Iteration Index Type +template<typename T> +struct IndexType { -public: - static const char * name() - { - return "Page Aligned Allocator"; - } - - static void* allocate(size_t size); - - static void deallocate(void * ptr, size_t size); - - static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); + static_assert(std::is_integral<T>::value,"Kokkos: Invalid IndexType<>."); + using index_type = IndexType<T>; + using type = T; }; +} // namespace Kokkos -}} // namespace Kokkos::Impl - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - -#endif //KOKKOS_BASIC_ALLOCATORS_HPP - +#endif // KOKKOS_CORE_CONCEPTS_HPP diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index e4a4643ce5..7cde4610ee 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -159,8 +159,6 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size ) } // namespace Kokkos -#if KOKKOS_USING_EXP_VIEW - namespace Kokkos { using Kokkos::Experimental::kokkos_malloc ; @@ -169,76 +167,6 @@ using Kokkos::Experimental::kokkos_free ; } -#else - -namespace Kokkos { - -namespace Impl { -// should only by used by kokkos_malloc and kokkos_free -struct MallocHelper -{ - static void increment_ref_count( AllocationTracker const & tracker ) - { - tracker.increment_ref_count(); - } - - static void decrement_ref_count( AllocationTracker const & tracker ) - { - tracker.decrement_ref_count(); - } -}; -} // namespace Impl - -/* Allocate memory from a memory space. - * The allocation is tracked in Kokkos memory tracking system, so - * leaked memory can be identified. 
- */ -template< class Arg = DefaultExecutionSpace> -void* kokkos_malloc(const std::string label, size_t count) { - if(count == 0) return NULL; - typedef typename Arg::memory_space MemorySpace; - Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);; - Impl::MallocHelper::increment_ref_count( tracker ); - return tracker.alloc_ptr(); -} - -template< class Arg = DefaultExecutionSpace> -void* kokkos_malloc(const size_t& count) { - return kokkos_malloc<Arg>("DefaultLabel",count); -} - - -/* Free memory from a memory space. - */ -template< class Arg = DefaultExecutionSpace> -void kokkos_free(const void* ptr) { - typedef typename Arg::memory_space MemorySpace; - typedef typename MemorySpace::allocator allocator; - Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr); - if (tracker.is_valid()) { - Impl::MallocHelper::decrement_ref_count( tracker ); - } -} - - -template< class Arg = DefaultExecutionSpace> -void* kokkos_realloc(const void* old_ptr, size_t size) { - if(old_ptr == NULL) - return kokkos_malloc<Arg>(size); - - typedef typename Arg::memory_space MemorySpace; - typedef typename MemorySpace::allocator allocator; - Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr); - - tracker.reallocate(size); - - return tracker.alloc_ptr(); -} - -} // namespace Kokkos - -#endif - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index a262864157..e9648b59b8 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -69,6 +69,9 @@ namespace { /**\brief Token to indicate that a parameter's value is to be automatically selected */ constexpr AUTO_t AUTO = Kokkos::AUTO_t(); } + +struct InvalidType {}; + } 
//---------------------------------------------------------------------------- @@ -205,7 +208,7 @@ namespace Impl { template< class Functor , class Policy , class EnableFunctor = void - , class EnablePolicy = void + , class EnablePolicy = void > struct FunctorPolicyExecutionSpace; @@ -225,7 +228,7 @@ template< class FunctorType , class ExecPolicy , class ExecutionSpace = /// /// This is an implementation detail of parallel_reduce. Users should /// skip this and go directly to the nonmember function parallel_reduce. -template< class FunctorType , class ExecPolicy , class ExecutionSpace = +template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace = typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space > class ParallelReduce ; diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp index eadcf13256..3130ee3198 100644 --- a/lib/kokkos/core/src/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -56,11 +56,14 @@ #include <Kokkos_CudaSpace.hpp> #include <Kokkos_Parallel.hpp> +#include <Kokkos_TaskPolicy.hpp> #include <Kokkos_Layout.hpp> #include <Kokkos_ScratchSpace.hpp> #include <Kokkos_MemoryTraits.hpp> #include <impl/Kokkos_Tags.hpp> +#include <KokkosExp_MDRangePolicy.hpp> + /*--------------------------------------------------------------------------*/ namespace Kokkos { @@ -108,7 +111,7 @@ public: //! This execution space's preferred array layout. typedef LayoutLeft array_layout ; - //! + //! typedef ScratchMemorySpace< Cuda > scratch_memory_space ; //@} @@ -257,10 +260,10 @@ struct VerifyExecutionCanAccessMemorySpace #include <Cuda/Kokkos_CudaExec.hpp> #include <Cuda/Kokkos_Cuda_View.hpp> -#include <KokkosExp_View.hpp> #include <Cuda/KokkosExp_Cuda_View.hpp> #include <Cuda/Kokkos_Cuda_Parallel.hpp> +#include <Cuda/Kokkos_Cuda_Task.hpp> //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp index c0223c35cf..cd728895d0 100644 --- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp @@ -54,10 +54,7 @@ #include <Kokkos_HostSpace.hpp> -#include <impl/Kokkos_AllocationTracker.hpp> - #include <Cuda/Kokkos_Cuda_abort.hpp> -#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp> /*--------------------------------------------------------------------------*/ @@ -77,33 +74,6 @@ public: /*--------------------------------*/ -#if ! KOKKOS_USING_EXP_VIEW - - typedef Impl::CudaMallocAllocator allocator; - - /** \brief Allocate a contiguous block of memory. - * - * The input label is associated with the block of memory. - * The block of memory is tracked via reference counting where - * allocation gives it a reference count of one. 
- */ - static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size ); - - /*--------------------------------*/ - /** \brief Cuda specific function to attached texture object to an allocation. - * Output the texture object, base pointer, and offset from the input pointer. - */ -#if defined( __CUDACC__ ) - static void texture_object_attach( Impl::AllocationTracker const & tracker - , unsigned type_size - , ::cudaChannelFormatDesc const & desc - ); -#endif - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - - /*--------------------------------*/ - CudaSpace(); CudaSpace( CudaSpace && rhs ) = default ; CudaSpace( const CudaSpace & rhs ) = default ; @@ -137,7 +107,7 @@ namespace Impl { /// where the hash value is derived from the address of the /// object for which an atomic operation is performed. /// This function initializes the locks to zero (unset). -void init_lock_array_cuda_space(); +void init_lock_arrays_cuda_space(); /// \brief Retrieve the pointer to the lock array for arbitrary size atomics. /// @@ -146,7 +116,23 @@ void init_lock_array_cuda_space(); /// object for which an atomic operation is performed. /// This function retrieves the lock array pointer. /// If the array is not yet allocated it will do so. -int* lock_array_cuda_space_ptr(bool deallocate = false); +int* atomic_lock_array_cuda_space_ptr(bool deallocate = false); + +/// \brief Retrieve the pointer to the scratch array for team and thread private global memory. +/// +/// Team and Thread private scratch allocations in +/// global memory are aquired via locks. +/// This function retrieves the lock array pointer. +/// If the array is not yet allocated it will do so. +int* scratch_lock_array_cuda_space_ptr(bool deallocate = false); + +/// \brief Retrieve the pointer to the scratch array for unique identifiers. +/// +/// Unique identifiers in the range 0-Cuda::concurrency +/// are provided via locks. +/// This function retrieves the lock array pointer. 
+/// If the array is not yet allocated it will do so. +int* threadid_lock_array_cuda_space_ptr(bool deallocate = false); } } // namespace Kokkos @@ -172,33 +158,6 @@ public: /*--------------------------------*/ -#if ! KOKKOS_USING_EXP_VIEW - - typedef Impl::CudaUVMAllocator allocator; - - /** \brief Allocate a contiguous block of memory. - * - * The input label is associated with the block of memory. - * The block of memory is tracked via reference counting where - * allocation gives it a reference count of one. - */ - static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size ); - - - /** \brief Cuda specific function to attached texture object to an allocation. - * Output the texture object, base pointer, and offset from the input pointer. - */ -#if defined( __CUDACC__ ) - static void texture_object_attach( Impl::AllocationTracker const & tracker - , unsigned type_size - , ::cudaChannelFormatDesc const & desc - ); -#endif - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - - /*--------------------------------*/ - CudaUVMSpace(); CudaUVMSpace( CudaUVMSpace && rhs ) = default ; CudaUVMSpace( const CudaUVMSpace & rhs ) = default ; @@ -242,22 +201,6 @@ public: /*--------------------------------*/ -#if ! KOKKOS_USING_EXP_VIEW - - typedef Impl::CudaHostAllocator allocator ; - - /** \brief Allocate a contiguous block of memory. - * - * The input label is associated with the block of memory. - * The block of memory is tracked via reference counting where - * allocation gives it a reference count of one. - */ - static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size ); - -#endif /* #if ! 
KOKKOS_USING_EXP_VIEW */ - - /*--------------------------------*/ - CudaHostPinnedSpace(); CudaHostPinnedSpace( CudaHostPinnedSpace && rhs ) = default ; CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ; diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index 8489978f54..5834fc04db 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -47,167 +47,15 @@ #include <Kokkos_Core_fwd.hpp> #include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_StaticAssert.hpp> +#include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_AnalyzePolicy.hpp> +#include <Kokkos_Concepts.hpp> #include <iostream> //---------------------------------------------------------------------------- namespace Kokkos { -//Schedules for Execution Policies -struct Static { -}; - -struct Dynamic { -}; - -//Schedule Wrapper Type -template<class ScheduleType> -struct Schedule { - static_assert(std::is_same<ScheduleType,Static>::value || - std::is_same<ScheduleType,Dynamic>::value, - "Kokkos: Invalid Schedule<> type."); - typedef Schedule<ScheduleType> schedule_type; - typedef ScheduleType type; -}; - -//Specif Iteration Index Type -template<typename iType> -struct IndexType { - static_assert(std::is_integral<iType>::value,"Kokkos: Invalid IndexType<>."); - typedef IndexType<iType> index_type; - typedef iType type; -}; - -namespace Impl { - -template<class Arg> -struct is_schedule_type { - enum { value = 0}; -}; - -template<class ScheduleType> -struct is_schedule_type<Schedule<ScheduleType> > { - enum {value = 1 }; -}; - -template<class Arg> -struct is_index_type { - enum { value = 0 }; -}; - -template<typename iType> -struct is_index_type<IndexType<iType> > { - enum { value = 1 }; -}; - -template<typename Arg> -struct is_tag_type { - enum { value = !(is_execution_space<Arg>::value || - is_schedule_type<Arg>::value || - is_index_type<Arg>::value || - std::is_integral<Arg>::value)}; -}; - -//Policy Traits -template<class ... 
Properties> -struct PolicyTraits; - -template<> -struct PolicyTraits<void> { - typedef void execution_space; - typedef void schedule_type; - typedef void index_type; - typedef void tag_type; -}; - - -//Strip off ExecutionSpace -template<class ExecutionSpace, class ... Props> -struct PolicyTraits<typename std::enable_if<is_execution_space<ExecutionSpace>::value >::type,ExecutionSpace,Props ...> { - static_assert( std::is_same<typename PolicyTraits<void, Props ...>::execution_space, void>::value, - "ExecutionPolicy: Only one execution space template argument may be used."); - typedef ExecutionSpace execution_space; - typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type; - typedef typename PolicyTraits<void, Props ...>::index_type index_type; - typedef typename PolicyTraits<void, Props ...>::tag_type tag_type; -}; - -//Strip off ScheduleType -template<class ScheduleType, class ... Props> -struct PolicyTraits<typename std::enable_if<is_schedule_type<Schedule<ScheduleType> >::value >::type,Schedule<ScheduleType>,Props ...> { - static_assert( std::is_same<typename PolicyTraits<void, Props ...>::schedule_type, void>::value, - "ExecutionPolicy: Only one Schedule<..> template argument may be used."); - typedef typename PolicyTraits<void, Props ...>::execution_space execution_space; - typedef ScheduleType schedule_type; - typedef typename PolicyTraits<void, Props ...>::index_type index_type; - typedef typename PolicyTraits<void, Props ...>::tag_type tag_type; -}; - -//Strip off IndexType -template<typename iType, class ... 
Props> -struct PolicyTraits<void, IndexType<iType>,Props ...> { - static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value, - "ExecutionPolicy: Only one IndexType<..> template argument may be used."); - typedef typename PolicyTraits<void, Props ...>::execution_space execution_space; - typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type; - typedef iType index_type; - typedef typename PolicyTraits<void, Props ...>::tag_type tag_type; -}; - -//Strip off raw IndexType -template<typename iType, class ... Props> -struct PolicyTraits<typename std::enable_if<std::is_integral<iType>::value>::type, iType,Props ...> { - static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value, - "ExecutionPolicy: Only one IndexType<..> template argument may be used."); - typedef typename PolicyTraits<void, Props ...>::execution_space execution_space; - typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type; - typedef iType index_type; - typedef typename PolicyTraits<void, Props ...>::tag_type tag_type; -}; - -//Strip off TagType -template<class TagType, class ... Props> -struct PolicyTraits<typename std::enable_if<!is_schedule_type<TagType>::value && - !is_execution_space<TagType>::value && - !is_index_type<TagType>::value && - !std::is_integral<TagType>::value - >::type, - TagType,Props ...> { - static_assert( std::is_same<typename PolicyTraits<void, Props ...>::tag_type, void>::value, - "ExecutionPolicy: Only one tag type template argument may be used."); - - typedef typename PolicyTraits<void, Props ...>::execution_space execution_space; - typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type; - typedef typename PolicyTraits<void, Props ...>::index_type index_type; - typedef TagType tag_type; -}; - - -template<class ... 
Props> -struct PolicyTraits { -#ifdef KOKKOS_DIRECT_VARIADIC_EXPANSION - typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::execution_space>::value, - Kokkos::DefaultExecutionSpace, typename PolicyTraits<void,Props ...>::execution_space>::type execution_space; - typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::schedule_type>::value, - Kokkos::Static, typename PolicyTraits<void,Props ...>::schedule_type>::type schedule_type; - typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::index_type>::value, - typename execution_space::size_type, typename PolicyTraits<void,Props ...>::index_type>::type index_type; - typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::tag_type>::value, - void, typename PolicyTraits<void,Props ...>::tag_type>::type work_tag; -#else - typedef typename has_condition<Kokkos::DefaultExecutionSpace,is_execution_space,Props ...>::type execution_space; - typedef typename has_condition<Kokkos::Schedule<Kokkos::Static>,is_schedule_type,Props ...>::type schedule_type; - typedef typename has_condition<void,is_tag_type,Props ...>::type work_tag; - typedef typename has_condition<typename execution_space::size_type, std::is_integral, Props ... >::type default_index_type; - typedef typename has_condition<Kokkos::IndexType<default_index_type>,is_index_type,Props ...>::type::type index_type; -#endif -}; - -} - -} - -namespace Kokkos { /** \brief Execution policy for work over a range of an integral type. * * Valid template argument options: @@ -230,7 +78,9 @@ namespace Kokkos { * Blocking is the granularity of partitioning the range among threads. */ template<class ... Properties> -class RangePolicy: public Impl::PolicyTraits<Properties ... > { +class RangePolicy + : public Impl::PolicyTraits<Properties ... > +{ private: typedef Impl::PolicyTraits<Properties ... 
> traits; @@ -243,6 +93,7 @@ private: public: //! Tag this class as an execution policy + typedef RangePolicy execution_policy; typedef typename traits::index_type member_type ; KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; } @@ -348,7 +199,7 @@ public: : m_begin(0), m_end(0) { if ( part_size ) { - + // Split evenly among partitions, then round up to the granularity. const member_type work_part = ( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size ) @@ -356,7 +207,7 @@ public: m_begin = range.begin() + work_part * part_rank ; m_end = m_begin + work_part ; - + if ( range.end() < m_begin ) m_begin = range.end() ; if ( range.end() < m_end ) m_end = range.end() ; } @@ -366,10 +217,11 @@ public: member_type m_end ; WorkRange(); WorkRange & operator = ( const WorkRange & ); - + }; }; + } // namespace Kokkos //---------------------------------------------------------------------------- @@ -377,38 +229,6 @@ public: namespace Kokkos { -namespace Experimental { - -/** \brief Scratch memory request accepting per team and per thread value - * - * An instance of this class can be given as the last argument to a - * TeamPolicy constructor. It sets the amount of user requested shared - * memory for the team. 
- */ - -template< class MemorySpace > -class TeamScratchRequest { - size_t m_per_team; - size_t m_per_thread; - -public: - TeamScratchRequest(size_t per_team_, size_t per_thread_ = 0): - m_per_team(per_team_), m_per_thread(per_thread_) { - } - - size_t per_team() const { - return m_per_team; - } - size_t per_thread() const { - return m_per_thread; - } - size_t total(const size_t team_size) const { - return m_per_team + m_per_thread * team_size; - } -}; - -} - namespace Impl { @@ -451,11 +271,9 @@ public: TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 ); - template<class MemorySpace> - TeamPolicyInternal( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request ); +/* TeamPolicyInternal( int league_size_request , int team_size_request ); - template<class MemorySpace> - TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request ); + TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/ /** \brief The actual league size (number of teams) of the policy. * @@ -574,12 +392,14 @@ class TeamPolicy: public typedef Impl::TeamPolicyInternal< typename Impl::PolicyTraits<Properties ... >::execution_space, Properties ...> internal_policy; + typedef Impl::PolicyTraits<Properties ... 
> traits; public: + typedef TeamPolicy execution_policy; TeamPolicy& operator = (const TeamPolicy&) = default; - + /** \brief Construct policy with the given instance of the execution space */ TeamPolicy( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 ) : internal_policy(typename traits::execution_space(),league_size_request,team_size_request, vector_length_request) {} @@ -594,13 +414,11 @@ public: TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 ) : internal_policy(league_size_request,Kokkos::AUTO(), vector_length_request) {} - template<class MemorySpace> - TeamPolicy( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request ) - : internal_policy(league_size_request,team_size_request, team_scratch_memory_request) {} +/* TeamPolicy( int league_size_request , int team_size_request ) + : internal_policy(league_size_request,team_size_request) {} - template<class MemorySpace> - TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request ) - : internal_policy(league_size_request,Kokkos::AUTO(), team_scratch_memory_request) {} + TeamPolicy( int league_size_request , const Kokkos::AUTO_t & ) + : internal_policy(league_size_request,Kokkos::AUTO()) {}*/ private: TeamPolicy(const internal_policy& p):internal_policy(p) {} @@ -744,6 +562,7 @@ Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange( } // namespace Kokkos + #endif /* #define KOKKOS_EXECPOLICY_HPP */ //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp index 6bef213b01..e02689b0f9 100644 --- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp @@ -120,21 +120,6 @@ 
public: //! This memory space preferred device_type typedef Kokkos::Device<execution_space,memory_space> device_type; - /*--------------------------------*/ -#if ! KOKKOS_USING_EXP_VIEW - - typedef Impl::HBWMallocAllocator allocator ; - - /** \brief Allocate a contiguous block of memory. - * - * The input label is associated with the block of memory. - * The block of memory is tracked via reference counting where - * allocation gives it a reference count of one. - */ - static Kokkos::Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size ); - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - /*--------------------------------*/ /* Functions unique to the HBWSpace */ static int in_parallel(); diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp index bea955cdd9..5fe686559a 100644 --- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -55,9 +55,6 @@ #include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_Error.hpp> -#include <impl/Kokkos_AllocationTracker.hpp> -#include <impl/Kokkos_BasicAllocators.hpp> - #include <impl/KokkosExp_SharedAlloc.hpp> /*--------------------------------------------------------------------------*/ @@ -128,25 +125,6 @@ public: //! This memory space preferred device_type typedef Kokkos::Device<execution_space,memory_space> device_type; - /*--------------------------------*/ -#if ! KOKKOS_USING_EXP_VIEW - -#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY ) - typedef Impl::PageAlignedAllocator allocator ; -#else - typedef Impl::AlignedAllocator allocator ; -#endif - - /** \brief Allocate a contiguous block of memory. - * - * The input label is associated with the block of memory. - * The block of memory is tracked via reference counting where - * allocation gives it a reference count of one. - */ - static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size ); - -#endif /* #if ! 
KOKKOS_USING_EXP_VIEW */ - /*--------------------------------*/ /* Functions unique to the HostSpace */ static int in_parallel(); diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index 40a46b3022..7d1e59af5e 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -133,11 +133,23 @@ // still identifies as 7.0 #error "Cuda version 7.5 or greater required for host-to-device Lambda support" #endif +#if ( CUDA_VERSION < 8000 ) #define KOKKOS_LAMBDA [=]__device__ +#else +#define KOKKOS_LAMBDA [=]__host__ __device__ +#endif #define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1 #endif #endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */ + +#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) + // Cuda version 8.0 still needs the functor wrapper + #if (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ ) + #define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER + #endif +#endif + /*--------------------------------------------------------------------------*/ /* Language info: C++, CUDA, OPENMP */ @@ -440,27 +452,16 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -/* Transitional macro to change between old and new View, - * default to use new View. +/* Transitional macro to change between old and new View + * are no longer supported. */ -#if ! defined( KOKKOS_USING_EXP_VIEW ) #if defined( KOKKOS_USING_DEPRECATED_VIEW ) -#define KOKKOS_USING_EXP_VIEW 0 -#else -#define KOKKOS_USING_EXP_VIEW 1 -#endif +#error "Kokkos deprecated View has been removed" #endif -#if KOKKOS_USING_EXP_VIEW -#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) +#define KOKKOS_USING_EXP_VIEW 1 #define KOKKOS_USING_EXPERIMENTAL_VIEW -#endif -#else /* ! 
KOKKOS_USING_EXP_VIEW */ -#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) -#error "KOKKOS_USING_EXP_VIEW and KOKKOS_USING_EXPERIMENAL_VIEW are both defined and are incompatible" -#endif -#endif //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp index 72d2a30560..d843f7c9a1 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -44,13 +44,16 @@ #ifndef KOKKOS_MEMORYPOOL_HPP #define KOKKOS_MEMORYPOOL_HPP -#include <vector> - #include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Atomic.hpp> +#include <impl/Kokkos_BitOps.hpp> #include <impl/Kokkos_Error.hpp> #include <impl/KokkosExp_SharedAlloc.hpp> -#include <Kokkos_ExecPolicy.hpp> -#include <Kokkos_Atomic.hpp> + +#include <limits> +#include <algorithm> +#include <chrono> // How should errors be handled? In general, production code should return a // value indicating failure so the user can decide how the error is handled. @@ -60,516 +63,1431 @@ //#define KOKKOS_MEMPOOL_PRINTERR //#define KOKKOS_MEMPOOL_PRINT_INFO +//#define KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO +//#define KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO +//#define KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO +//#define KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS +//#define KOKKOS_MEMPOOL_PRINT_PAGE_INFO +//#define KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO + +// A superblock is considered full when this percentage of its pages are full. +#define KOKKOS_MEMPOOL_SB_FULL_FRACTION 0.80 + +// A page is considered full when this percentage of its blocks are full. 
+#define KOKKOS_MEMPOOL_PAGE_FULL_FRACTION 0.875 // 28 / 32 //---------------------------------------------------------------------------- namespace Kokkos { namespace Experimental { -template < typename Space , typename ExecSpace = typename Space::execution_space > -class MemoryPool; +namespace MempoolImpl { -namespace Impl { +template < typename T, typename ExecutionSpace > +struct initialize_array { + typedef ExecutionSpace execution_space; + typedef typename ExecutionSpace::size_type size_type; -#ifdef KOKKOS_MEMPOOL_PRINT_INFO -template < typename MemPool > -struct print_mempool { - size_t m_num_chunk_sizes; - size_t * m_chunk_size; - uint64_t * m_freelist; - char * m_data; - - print_mempool( size_t ncs, size_t * cs, uint64_t * f, char * d ) - : m_num_chunk_sizes(ncs), m_chunk_size(cs), m_freelist(f), m_data(d) - {} + T * m_data; + T m_value; - KOKKOS_INLINE_FUNCTION - void operator()( size_t i ) const + initialize_array( T * d, size_t size, T v ) : m_data( d ), m_value( v ) { - if ( i == 0 ) { - printf( "*** ON DEVICE ***\n"); - printf( "m_chunk_size: 0x%llx\n", reinterpret_cast<uint64_t>( m_chunk_size ) ); - printf( " m_freelist: 0x%llx\n", reinterpret_cast<uint64_t>( m_freelist ) ); - printf( " m_data: 0x%llx\n", reinterpret_cast<uint64_t>( m_data ) ); - for ( size_t l = 0; l < m_num_chunk_sizes; ++l ) { - printf( "%2lu freelist: %10llu chunk_size: %6lu\n", - l, get_head_offset( m_freelist[l] ), m_chunk_size[l] ); - } - printf( " chunk_size: %6lu\n\n", - m_chunk_size[m_num_chunk_sizes] ); - } + Kokkos::parallel_for( size, *this ); + + execution_space::fence(); } - // This is only redefined here to avoid having to pass a MemPoolList object - // to the class. 
KOKKOS_INLINE_FUNCTION - uint64_t get_head_offset(uint64_t head) const - { return ( head >> MemPool::TAGBITS ) << MemPool::LG_MIN_CHUNKSIZE; } + void operator()( size_type i ) const { m_data[i] = m_value; } }; -#endif -template < typename MemPool > -struct initialize_mempool { - char * m_data; - size_t m_chunk_size; - size_t m_last_chunk; - size_t m_base_offset; +template <typename Bitset> +struct bitset_count +{ + typedef typename Bitset::execution_space execution_space; + typedef typename execution_space::size_type size_type; + typedef typename Bitset::size_type value_type; + typedef typename Bitset::word_type word_type; + + word_type * m_words; + value_type & m_result; + + bitset_count( word_type * w, value_type num_words, value_type & r ) + : m_words( w ), m_result( r ) + { + parallel_reduce( num_words, *this, m_result ); + } - initialize_mempool( char * d, size_t cs, size_t lc, size_t bo ) - : m_data(d), m_chunk_size(cs), m_last_chunk(lc), m_base_offset(bo) - {} + KOKKOS_INLINE_FUNCTION + void init( value_type & v ) const + { v = 0; } KOKKOS_INLINE_FUNCTION - void operator()( size_t i ) const + void join( volatile value_type & dst, volatile value_type const & src ) const + { dst += src; } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type i, value_type & count) const { - uint64_t * lp = - reinterpret_cast<uint64_t *>( m_data + m_base_offset + i * m_chunk_size ); - - // All entries in the list point to the next entry except the last which - // uses a reserved value to indicate the end of the list. The offset from - // the base pointer is stored in increments of the minimum chunk size. - *lp = i < m_last_chunk ? 
- m_base_offset + (i + 1) * m_chunk_size : - MemPool::FREELIST_END; + count += Kokkos::Impl::bit_count( m_words[i] ); } }; -class MemPoolList { -private: - - typedef Impl::SharedAllocationTracker Tracker; +template < typename Device > +class Bitset { +public: + typedef typename Device::execution_space execution_space; + typedef typename Device::memory_space memory_space; + typedef unsigned word_type; + typedef unsigned size_type; - template < typename , typename > friend class Kokkos::Experimental::MemoryPool; - template < typename > friend struct initialize_mempool; -#ifdef KOKKOS_MEMPOOL_PRINT_INFO - template < typename > friend struct print_mempool; -#endif + typedef Kokkos::Impl::DeepCopy< memory_space, Kokkos::HostSpace > raw_deep_copy; // Define some constants. enum { - // The head of a freelist is a 64 bit unsigned interger. We divide it - // into 2 pieces. The upper (64-TAGBITS) bits is the offset from the base - // data pointer of the allocator in increments of the minimum chunk size. - // The lower TAGBITS bits is the tag used to prevent ABA problems. The - // largest two values that fit in the offset portion are reserved to - // represent the end of the freelist and that the freelist is locked. - // - // Using 32 bits for both the tag and offset and with a minimum chunk size - // of 128 bytes, the offset can address 549755813632 bytes (app. 512 GB) - // of memory. This should be more than enough to address the whole address - // space of a GPU or MIC for the foreseeable future. - TAGBITS = 32, - MIN_CHUNKSIZE = 128, - - TAGBITS_MASK = ( uint64_t( 1 ) << TAGBITS ) - 1, - LG_MIN_CHUNKSIZE = Kokkos::Impl::integral_power_of_two(MIN_CHUNKSIZE), - - // The largest two values of the offset are reserved to indicate the end of a - // freelist (2^TAGBITS - 2) and that the freelist is locked (2^TAGBITS - 1). - // They are shifted so they can be compared directly to the result of - // get_head_offset(). 
- FREELIST_END = uint64_t( TAGBITS_MASK - 1 ) << LG_MIN_CHUNKSIZE, - FREELIST_LOCK = uint64_t( TAGBITS_MASK ) << LG_MIN_CHUNKSIZE, - - // This is the head value for a locked freelist. It uses the lock value for - // the offset and 0 for the tagbits. - FREELIST_LOCK_HEAD = uint64_t( TAGBITS_MASK ) << TAGBITS + // Size of bitset word. Should be 32. + WORD_SIZE = sizeof(word_type) * CHAR_BIT, + LG_WORD_SIZE = Kokkos::Impl::integral_power_of_two( WORD_SIZE ), + WORD_MASK = WORD_SIZE - 1 }; - Tracker m_track; +private: + word_type * m_words; + size_type m_size; + size_type m_num_words; + word_type m_last_word_mask; - // These three variables are pointers into device memory. - size_t * m_chunk_size; // Array of chunk sizes of freelists. - uint64_t * m_freelist; // Array of freelist heads. - char * m_data; // Beginning memory location used for chunks. +public: + ~Bitset() = default; + Bitset() = default; + Bitset( Bitset && ) = default; + Bitset( const Bitset & ) = default; + Bitset & operator = ( Bitset && ) = default; + Bitset & operator = ( const Bitset & ) = default; + + void init( void * w, size_type s ) + { + // Assumption: The size of the memory pointed to by w is a multiple of + // sizeof(word_type). - size_t m_data_size; - size_t m_chunk_spacing; + m_words = reinterpret_cast<word_type*>( w ); + m_size = s; + m_num_words = ( s + WORD_SIZE - 1 ) >> LG_WORD_SIZE; + m_last_word_mask = m_size & WORD_MASK ? 
( word_type(1) << ( m_size & WORD_MASK ) ) - 1 : 0; -#if defined(KOKKOS_MEMPOOL_PRINT_INFO) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) - static long m_count; -#endif + reset(); + } - ~MemPoolList() = default; - MemPoolList() = default; - MemPoolList( MemPoolList && ) = default; - MemPoolList( const MemPoolList & ) = default; - MemPoolList & operator = ( MemPoolList && ) = default; - MemPoolList & operator = ( const MemPoolList & ) = default; + size_type size() const { return m_size; } - template < typename MemorySpace, typename ExecutionSpace > - inline - MemPoolList( const MemorySpace & memspace, const ExecutionSpace &, - size_t arg_base_chunk_size, size_t arg_total_size, - size_t num_chunk_sizes, size_t chunk_spacing ) - : m_track(), m_chunk_size(0), m_freelist(0), m_data(0), m_data_size(0), - m_chunk_spacing(chunk_spacing) + size_type count() const { - static_assert( sizeof(size_t) <= sizeof(void*), "" ); + size_type val; + bitset_count< Bitset > bc( m_words, m_num_words, val ); + return val; + } - typedef Impl::SharedAllocationRecord< MemorySpace, void > SharedRecord; - typedef Kokkos::RangePolicy< ExecutionSpace > Range; + void set() + { + // Set all the bits. + initialize_array< word_type, execution_space > ia( m_words, m_num_words, ~word_type(0) ); - size_t base_chunk_size = arg_base_chunk_size; + if ( m_last_word_mask ) { + // Clear the unused bits in the last block. + raw_deep_copy( m_words + ( m_num_words - 1 ), &m_last_word_mask, sizeof(word_type) ); + } + } - // The base chunk size must be at least MIN_CHUNKSIZE bytes as this is the - // cache-line size for NVIDA GPUs. - if ( base_chunk_size < MIN_CHUNKSIZE ) { + void reset() + { + initialize_array< word_type, execution_space > ia( m_words, m_num_words, word_type(0) ); + } -#ifdef KOKKOS_MEMPOOL_PRINT_INFO - printf( "** Chunk size must be at least %u bytes. Setting to %u. 
**\n", - MIN_CHUNKSIZE, MIN_CHUNKSIZE); - fflush( stdout ); -#endif + KOKKOS_FORCEINLINE_FUNCTION + bool test( size_type i ) const + { + size_type word_pos = i >> LG_WORD_SIZE; + word_type word = volatile_load( &m_words[ word_pos ] ); + word_type mask = word_type(1) << ( i & WORD_MASK ); + + return word & mask; + } + + KOKKOS_FORCEINLINE_FUNCTION + bool set( size_type i ) const + { + size_type word_pos = i >> LG_WORD_SIZE; + word_type mask = word_type(1) << ( i & WORD_MASK ); + + return !( atomic_fetch_or( &m_words[ word_pos ], mask ) & mask ); + } + + KOKKOS_FORCEINLINE_FUNCTION + bool reset( size_type i ) const + { + size_type word_pos = i >> LG_WORD_SIZE; + word_type mask = word_type(1) << ( i & WORD_MASK ); + + return atomic_fetch_and( &m_words[ word_pos ], ~mask ) & mask; + } + + KOKKOS_FORCEINLINE_FUNCTION + Kokkos::pair< bool, word_type > + fetch_word_reset( size_type i ) const + { + size_type word_pos = i >> LG_WORD_SIZE; + word_type mask = word_type(1) << ( i & WORD_MASK ); + + Kokkos::pair<bool, word_type> result; + result.second = atomic_fetch_and( &m_words[ word_pos ], ~mask ); + result.first = result.second & mask; + + return result; + } + + KOKKOS_FORCEINLINE_FUNCTION + Kokkos::pair< bool, size_type > + set_any_in_word( size_type i, word_type & prev_val ) const + { + prev_val = 0; - base_chunk_size = MIN_CHUNKSIZE; + size_type word_pos = i >> LG_WORD_SIZE; + word_type word = volatile_load( &m_words[ word_pos ] ); + + // Loop until there are no more unset bits in the word. + while ( ~word ) { + // Find the first unset bit in the word. + size_type bit = Kokkos::Impl::bit_scan_forward( ~word ); + + // Try to set the bit. + word_type mask = word_type(1) << bit; + word = atomic_fetch_or( &m_words[ word_pos ], mask ); + + if ( !( word & mask ) ) { + // Successfully set the bit. 
+ prev_val = word; + + return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit ); + } } - // The base chunk size must also be a multiple of MIN_CHUNKSIZE bytes for - // correct memory alignment of the chunks. If it isn't a multiple of - // MIN_CHUNKSIZE, set it to the smallest multiple of MIN_CHUNKSIZE - // greater than the given chunk size. - if ( base_chunk_size % MIN_CHUNKSIZE != 0 ) { - size_t old_chunk_size = base_chunk_size; - base_chunk_size = ( ( old_chunk_size + MIN_CHUNKSIZE - 1 ) / MIN_CHUNKSIZE ) * - MIN_CHUNKSIZE; + // Didn't find a free bit in this word. + return Kokkos::pair<bool, size_type>( false, i ); + } -#ifdef KOKKOS_MEMPOOL_PRINT_INFO - printf( "** Chunk size must be a multiple of %u bytes. Given: %lu Using: %lu. **\n", - MIN_CHUNKSIZE, old_chunk_size, base_chunk_size); - fflush( stdout ); -#endif + KOKKOS_FORCEINLINE_FUNCTION + Kokkos::pair< bool, size_type > + set_any_in_word( size_type i, word_type & prev_val, word_type word_mask ) const + { + prev_val = 0; + + size_type word_pos = i >> LG_WORD_SIZE; + word_type word = volatile_load( &m_words[ word_pos ] ); + word = ( ~word ) & word_mask; + // Loop until there are no more unset bits in the word. + while ( word ) { + // Find the first unset bit in the word. + size_type bit = Kokkos::Impl::bit_scan_forward( word ); + + // Try to set the bit. + word_type mask = word_type(1) << bit; + word = atomic_fetch_or( &m_words[ word_pos ], mask ); + + if ( !( word & mask ) ) { + // Successfully set the bit. + prev_val = word; + + return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit ); + } + + word = ( ~word ) & word_mask; } - // Force total_size to be a multiple of base_chunk_size. - // Preserve the number of chunks originally requested. - size_t total_size = base_chunk_size * - ( ( arg_total_size + arg_base_chunk_size - 1 ) / arg_base_chunk_size ); + // Didn't find a free bit in this word. 
+ return Kokkos::pair<bool, size_type>( false, i ); + } + + KOKKOS_FORCEINLINE_FUNCTION + Kokkos::pair< bool, size_type > + reset_any_in_word( size_type i, word_type & prev_val ) const + { + prev_val = 0; + + size_type word_pos = i >> LG_WORD_SIZE; + word_type word = volatile_load( &m_words[ word_pos ] ); - m_data_size = total_size; + // Loop until there are no more set bits in the word. + while ( word ) { + // Find the first unset bit in the word. + size_type bit = Kokkos::Impl::bit_scan_forward( word ); - // Get the chunk size for the largest possible chunk. - // max_chunk_size = - // base_chunk_size * (m_chunk_spacing ^ (num_chunk_sizes - 1)) - size_t max_chunk_size = base_chunk_size; - for (size_t i = 1; i < num_chunk_sizes; ++i) { - max_chunk_size *= m_chunk_spacing; + // Try to reset the bit. + word_type mask = word_type(1) << bit; + word = atomic_fetch_and( &m_words[ word_pos ], ~mask ); + + if ( word & mask ) { + // Successfully reset the bit. + prev_val = word; + + return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit ); + } } - // We want each chunk size to use total_size / num_chunk_sizes memory. If - // the total size of the pool is not enough to accomodate this, keep making - // the next lower chunk size the max_chunk_size until it is. - while ( max_chunk_size > total_size / num_chunk_sizes ) { - max_chunk_size /= m_chunk_spacing; - --num_chunk_sizes; + // Didn't find a free bit in this word. + return Kokkos::pair<bool, size_type>( false, i ); + } + + KOKKOS_FORCEINLINE_FUNCTION + Kokkos::pair< bool, size_type > + reset_any_in_word( size_type i, word_type & prev_val, word_type word_mask ) const + { + prev_val = 0; + + size_type word_pos = i >> LG_WORD_SIZE; + word_type word = volatile_load( &m_words[ word_pos ] ); + word = word & word_mask; + + // Loop until there are no more set bits in the word. + while ( word ) { + // Find the first unset bit in the word. 
+ size_type bit = Kokkos::Impl::bit_scan_forward( word ); + + // Try to reset the bit. + word_type mask = word_type(1) << bit; + word = atomic_fetch_and( &m_words[ word_pos ], ~mask ); + + if ( word & mask ) { + // Successfully reset the bit. + prev_val = word; + + return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit ); + } + + word = word & word_mask; } - // We put a header at the beginnig of the device memory and use extra - // chunks to store the header. The header contains: - // size_t chunk_size[num_chunk_sizes+1] - // uint64_t freelist[num_chunk_sizes] + // Didn't find a free bit in this word. + return Kokkos::pair<bool, size_type>( false, i ); + } +}; - // Calculate the size of the header where the size is rounded up to the - // smallest multiple of base_chunk_size >= the needed size. The size of the - // chunk size array is calculated using sizeof(void*) to guarantee alignment - // for the freelist array. This assumes sizeof(size_t) <= sizeof(void*). - size_t header_bytes = ( 2 * num_chunk_sizes + 1 ) * sizeof(void*); - size_t header_size = - ( header_bytes + base_chunk_size - 1 ) / base_chunk_size * base_chunk_size; +template < typename UInt32View, typename BSHeaderView, typename SBHeaderView, + typename MempoolBitset > +struct create_histogram { + typedef typename UInt32View::execution_space execution_space; + typedef typename execution_space::size_type size_type; + typedef Kokkos::pair< double, uint32_t > value_type; + + size_t m_start; + UInt32View m_page_histogram; + BSHeaderView m_blocksize_info; + SBHeaderView m_sb_header; + MempoolBitset m_sb_blocks; + size_t m_lg_max_sb_blocks; + uint32_t m_lg_min_block_size; + uint32_t m_blocks_per_page; + value_type & m_result; + + create_histogram( size_t start, size_t end, UInt32View ph, BSHeaderView bsi, + SBHeaderView sbh, MempoolBitset sbb, size_t lmsb, + uint32_t lmbs, uint32_t bpp, value_type & r ) + : m_start( start ), m_page_histogram( ph ), m_blocksize_info( bsi ), + 
m_sb_header( sbh ), m_sb_blocks( sbb ), m_lg_max_sb_blocks( lmsb ), + m_lg_min_block_size( lmbs ), m_blocks_per_page( bpp ), m_result( r ) + { + Kokkos::parallel_reduce( end - start, *this, m_result ); - // Allocate the memory including the header. - size_t alloc_size = total_size + header_size; + execution_space::fence(); + } -#ifdef KOKKOS_MEMPOOL_PRINT_INFO - printf( "** Allocating total %ld bytes\n", long(alloc_size)); - fflush( stdout ); -#endif + KOKKOS_INLINE_FUNCTION + void init( value_type & v ) const + { + v.first = 0.0; + v.second = 0; + } - SharedRecord * rec = - SharedRecord::allocate( memspace, "mempool", alloc_size ); + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & dst, volatile value_type const & src ) const + { + dst.first += src.first; + dst.second += src.second; + } -#ifdef KOKKOS_MEMPOOL_PRINT_INFO - printf( "** Allocated total %ld bytes at 0x%lx\n", - long(alloc_size), long(rec->data()) ); - fflush( stdout ); -#endif + KOKKOS_INLINE_FUNCTION + void operator()( size_type i, value_type & r ) const + { + size_type i2 = i + m_start; - m_track.assign_allocated_record_to_uninitialized( rec ); + uint32_t lg_block_size = m_sb_header(i2).m_lg_block_size; - { - // Get the pointers into the allocated memory. - char * mem = reinterpret_cast<char *>( rec->data() ); - m_chunk_size = reinterpret_cast<size_t *>( mem ); - m_freelist = reinterpret_cast<uint64_t *>( - mem + ( num_chunk_sizes + 1 ) * sizeof(void*) ); - m_data = mem + header_size; + // A superblock only has a block size of 0 when it is empty. 
+ if ( lg_block_size != 0 ) { + uint32_t block_size_id = lg_block_size - m_lg_min_block_size; + uint32_t blocks_per_sb = m_blocksize_info[block_size_id].m_blocks_per_sb; + uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb; -#ifdef KOKKOS_MEMPOOL_PRINT_INFO - printf( "** Partitioning allocation 0x%lx : m_chunk_size[0x%lx] m_freelist[0x%lx] m_data[0x%lx]\n", - (unsigned long) mem, (unsigned long) m_chunk_size, - (unsigned long) m_freelist, (unsigned long) m_data ); - fflush( stdout ); -#endif + uint32_t total_allocated_blocks = 0; + + for ( uint32_t j = 0; j < pages_per_sb; ++j ) { + unsigned start_pos = ( i2 << m_lg_max_sb_blocks ) + j * m_blocks_per_page; + unsigned end_pos = start_pos + m_blocks_per_page; + uint32_t page_allocated_blocks = 0; + + for ( unsigned k = start_pos; k < end_pos; ++k ) { + page_allocated_blocks += m_sb_blocks.test( k ); + } + + total_allocated_blocks += page_allocated_blocks; + + atomic_fetch_add( &m_page_histogram(page_allocated_blocks), 1 ); + } + + r.first += double(total_allocated_blocks) / blocks_per_sb; + r.second += blocks_per_sb; } + } +}; + +#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO +template < typename UInt32View, typename SBHeaderView, typename MempoolBitset > +struct count_allocated_blocks { + typedef typename UInt32View::execution_space execution_space; + typedef typename execution_space::size_type size_type; + + UInt32View m_num_allocated_blocks; + SBHeaderView m_sb_header; + MempoolBitset m_sb_blocks; + size_t m_sb_size; + size_t m_lg_max_sb_blocks; + + count_allocated_blocks( size_t num_sb, UInt32View nab, SBHeaderView sbh, + MempoolBitset sbb, size_t sbs, size_t lmsb ) + : m_num_allocated_blocks( nab ), m_sb_header( sbh ), + m_sb_blocks( sbb ), m_sb_size( sbs ), m_lg_max_sb_blocks( lmsb ) + { + Kokkos::parallel_for( num_sb, *this ); + + execution_space::fence(); + } - // Initialize the chunk sizes array. 
Create num_chunk_sizes different - // chunk sizes where each successive chunk size is - // m_chunk_spacing * previous chunk size. The last entry in the array is - // 0 and is used for a stopping condition. - m_chunk_size[0] = base_chunk_size; - for ( size_t i = 1; i < num_chunk_sizes; ++i ) { - m_chunk_size[i] = m_chunk_size[i - 1] * m_chunk_spacing; + KOKKOS_INLINE_FUNCTION + void operator()( size_type i ) const + { + uint32_t lg_block_size = m_sb_header(i).m_lg_block_size; + + // A superblock only has a block size of 0 when it is empty. + if ( lg_block_size != 0 ) { + // Count the allocated blocks in the superblock. + uint32_t blocks_per_sb = lg_block_size > 0 ? m_sb_size >> lg_block_size : 0; + unsigned start_pos = i << m_lg_max_sb_blocks; + unsigned end_pos = start_pos + blocks_per_sb; + uint32_t count = 0; + + for ( unsigned j = start_pos; j < end_pos; ++j ) { + count += m_sb_blocks.test( j ); + } + + m_num_allocated_blocks(i) = count; } - m_chunk_size[num_chunk_sizes] = 0; + } +}; +#endif - std::vector<size_t> num_chunks(num_chunk_sizes); +} - // Set the starting point in memory and get the number of chunks for each - // freelist. Start with the largest chunk size to ensure usage of all the - // memory. If there is leftover memory for a chunk size, it will be used - // by a smaller chunk size. - size_t used_memory = 0; - for ( size_t i = num_chunk_sizes; i > 0; --i ) { - // Set the starting position in the memory for the current chunk sizes's - // freelist and initialize the tag to 0. - m_freelist[i - 1] = create_head( used_memory, 0UL ); +/// \class MemoryPool +/// \brief Bitset based memory manager for pools of same-sized chunks of memory. +/// \tparam Device Kokkos device that gives the execution and memory space the +/// allocator will be used in. +/// +/// MemoryPool is a memory space that can be on host or device. It provides a +/// pool memory allocator for fast allocation of same-sized chunks of memory. 
+/// The memory is only accessible on the host / device this allocator is +/// associated with. +/// +/// This allocator is based on ideas from the following GPU allocators: +/// Halloc (https://github.com/canonizer/halloc). +/// ScatterAlloc (https://github.com/ComputationalRadiationPhysics/scatteralloc) +template < typename Device > +class MemoryPool { +private: + // The allocator uses superblocks. A superblock is divided into pages, and a + // page is divided into blocks. A block is the chunk of memory that is given + // out by the allocator. A page always has a number of blocks equal to the + // size of the word used by the bitset. Thus, the pagesize can vary between + // superblocks as it is based on the block size of the superblock. The + // allocator supports all powers of 2 from MIN_BLOCK_SIZE to the size of a + // superblock as block sizes. + + // Superblocks are divided into 4 categories: + // 1. empty - is completely empty; there are no active allocations + // 2. partfull - partially full; there are some active allocations + // 3. full - full enough with active allocations that new allocations + // will likely fail + // 4. active - is currently the active superblock for a block size + // + // An inactive superblock is one that is empty, partfull, or full. + // + // New allocations occur only from an active superblock. If a superblock is + // made inactive after an allocation request is made to it but before the + // allocation request is fulfilled, the allocation will still be attempted + // from that superblock. Deallocations can occur to partfull, full, or + // active superblocks. Superblocks move between categories as allocations + // and deallocations happen. Superblocks all start empty. + // + // Here are the possible moves between categories: + // empty -> active During allocation, there is no active superblock + // or the active superblock is full. 
+ // active -> full During allocation, the full threshold of the + // superblock is reached when increasing the fill + // level. + // full -> partfull During deallocation, the full threshold of the + // superblock is crossed when decreasing the fill + // level. + // partfull -> empty Deallocation of the last allocated block of an + // inactive superblock. + // partfull -> active During allocation, the active superblock is full. + // + // When a new active superblock is needed, partfull superblocks of the same + // block size are chosen over empty superblocks. + // + // The empty and partfull superblocks are tracked using bitsets that represent + // the superblocks in those repsective categories. Empty superblocks use a + // single bitset, while partfull superblocks use a bitset per block size + // (contained sequentially in a single bitset). Active superblocks are + // tracked by the active superblocks array. Full superblocks aren't tracked + // at all. + + typedef typename Device::execution_space execution_space; + typedef typename Device::memory_space backend_memory_space; + typedef Device device_type; + typedef MempoolImpl::Bitset< device_type > MempoolBitset; - size_t mem_avail = - total_size - (i - 1) * ( total_size / num_chunk_sizes ) - used_memory; + // Define some constants. + enum { + MIN_BLOCK_SIZE = 64, + LG_MIN_BLOCK_SIZE = Kokkos::Impl::integral_power_of_two( MIN_BLOCK_SIZE ), + MAX_BLOCK_SIZES = 31 - LG_MIN_BLOCK_SIZE + 1, - // Set the number of chunks for the current chunk sizes's freelist. - num_chunks[i - 1] = mem_avail / m_chunk_size[i - 1]; + // Size of bitset word. 
+ BLOCKS_PER_PAGE = MempoolBitset::WORD_SIZE, + LG_BLOCKS_PER_PAGE = MempoolBitset::LG_WORD_SIZE, - used_memory += num_chunks[i - 1] * m_chunk_size[i - 1]; - } + INVALID_SUPERBLOCK = ~uint32_t(0), + SUPERBLOCK_LOCK = ~uint32_t(0) - 1, -#ifdef KOKKOS_MEMPOOL_PRINT_INFO - printf( "\n" ); - printf( "*** ON HOST ***\n"); - printf( "m_chunk_size: 0x%llx\n", reinterpret_cast<uint64_t>( m_chunk_size ) ); - printf( " m_freelist: 0x%llx\n", reinterpret_cast<uint64_t>( m_freelist ) ); - printf( " m_data: 0x%llx\n", reinterpret_cast<uint64_t>( m_data ) ); - for ( size_t i = 0; i < num_chunk_sizes; ++i ) { - printf( "%2lu freelist: %10llu chunk_size: %6lu num_chunks: %8lu\n", - i, get_head_offset( m_freelist[i] ), m_chunk_size[i], num_chunks[i] ); - } - printf( " chunk_size: %6lu\n\n", - m_chunk_size[num_chunk_sizes] ); - fflush( stdout ); + MAX_TRIES = 32 // Cap on the number of pages searched + // before an allocation returns empty. + }; + +public: + // Stores information about each superblock. + struct SuperblockHeader { + uint32_t m_full_pages; + uint32_t m_empty_pages; + uint32_t m_lg_block_size; + uint32_t m_is_active; + + KOKKOS_FUNCTION + SuperblockHeader() : + m_full_pages(0), m_empty_pages(0), m_lg_block_size(0), m_is_active(false) {} + }; + + // Stores information about each block size. + struct BlockSizeHeader { + uint32_t m_blocks_per_sb; + uint32_t m_pages_per_sb; + uint32_t m_sb_full_level; + uint32_t m_page_full_level; + + KOKKOS_FUNCTION + BlockSizeHeader() : + m_blocks_per_sb(0), m_pages_per_sb(0), m_sb_full_level(0), m_page_full_level(0) {} + }; + +private: + typedef Impl::SharedAllocationTracker Tracker; + typedef View< uint32_t *, device_type > UInt32View; + typedef View< SuperblockHeader *, device_type > SBHeaderView; + + // The letters 'sb' used in any variable name mean superblock. + + size_t m_lg_sb_size; // Log2 of superblock size. + size_t m_sb_size; // Superblock size. 
+ size_t m_lg_max_sb_blocks; // Log2 of the number of blocks of the + // minimum block size in a superblock. + size_t m_num_sb; // Number of superblocks. + size_t m_ceil_num_sb; // Number of superblocks rounded up to the smallest + // multiple of the bitset word size. Used by + // bitsets representing superblock categories to + // ensure different block sizes never share a word + // in the bitset. + size_t m_num_block_size; // Number of block sizes supported. + size_t m_data_size; // Amount of memory available to the allocator. + size_t m_sb_blocks_size; // Amount of memory for free / empty blocks bitset. + size_t m_empty_sb_size; // Amount of memory for empty superblocks bitset. + size_t m_partfull_sb_size; // Amount of memory for partfull superblocks bitset. + size_t m_total_size; // Total amount of memory allocated. + char * m_data; // Beginning device memory location used for + // superblocks. + UInt32View m_active; // Active superblocks IDs. + SBHeaderView m_sb_header; // Header info for superblocks. + MempoolBitset m_sb_blocks; // Bitsets representing free / allocated status + // of blocks in superblocks. + MempoolBitset m_empty_sb; // Bitset representing empty superblocks. + MempoolBitset m_partfull_sb; // Bitsets representing partially full superblocks. + Tracker m_track; // Tracker for superblock memory. + BlockSizeHeader m_blocksize_info[MAX_BLOCK_SIZES]; // Header info for block sizes. + + // There were several methods tried for storing the block size header info: in a View, + // in a View of const data, and in a RandomAccess View. All of these were slower than + // storing it in a static array that is a member variable to the class. In the latter + // case, the block size info gets copied into the constant memory on the GPU along with + // the class when it is copied there for exeucting a parallel loop. Instead of storing + // the values, computing the values every time they were needed was also tried. 
This + // method was slightly slower than storing them in the static array. + +public: + //! Tag this class as a kokkos memory space + typedef MemoryPool memory_space; + + ~MemoryPool() = default; + MemoryPool() = default; + MemoryPool( MemoryPool && ) = default; + MemoryPool( const MemoryPool & ) = default; + MemoryPool & operator = ( MemoryPool && ) = default; + MemoryPool & operator = ( const MemoryPool & ) = default; + + /// \brief Initializes the memory pool. + /// \param memspace The memory space from which the memory pool will allocate memory. + /// \param total_size The requested memory amount controlled by the allocator. The + /// actual amount is rounded up to the smallest multiple of the + /// superblock size >= the requested size. + /// \param log2_superblock_size Log2 of the size of superblocks used by the allocator. + /// In most use cases, the default value should work. + inline + MemoryPool( const backend_memory_space & memspace, + size_t total_size, size_t log2_superblock_size = 20 ) + : m_lg_sb_size( log2_superblock_size ), + m_sb_size( size_t(1) << m_lg_sb_size ), + m_lg_max_sb_blocks( m_lg_sb_size - LG_MIN_BLOCK_SIZE ), + m_num_sb( ( total_size + m_sb_size - 1 ) >> m_lg_sb_size ), + m_ceil_num_sb( ( ( m_num_sb + BLOCKS_PER_PAGE - 1 ) >> LG_BLOCKS_PER_PAGE ) << + LG_BLOCKS_PER_PAGE ), + m_num_block_size( m_lg_sb_size - LG_MIN_BLOCK_SIZE + 1 ), + m_data_size( m_num_sb * m_sb_size ), + m_sb_blocks_size( ( m_num_sb << m_lg_max_sb_blocks ) / CHAR_BIT ), + m_empty_sb_size( m_ceil_num_sb / CHAR_BIT ), + m_partfull_sb_size( m_ceil_num_sb * m_num_block_size / CHAR_BIT ), + m_total_size( m_data_size + m_sb_blocks_size + m_empty_sb_size + m_partfull_sb_size ), + m_data(0), + m_active( "Active superblocks" ), + m_sb_header( "Superblock headers" ), + m_track() + { + // Assumption. The minimum block size must be a power of 2. + static_assert( Kokkos::Impl::is_integral_power_of_two( MIN_BLOCK_SIZE ), "" ); + + // Assumption. 
Require a superblock be large enough so it takes at least 1 + // whole bitset word to represent it using the minimum blocksize. + if ( m_sb_size < MIN_BLOCK_SIZE * BLOCKS_PER_PAGE ) { + printf( "\n** MemoryPool::MemoryPool() Superblock size must be >= %u **\n", + MIN_BLOCK_SIZE * BLOCKS_PER_PAGE ); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + fflush( stdout ); #endif + Kokkos::abort( "" ); + } -#ifdef KOKKOS_MEMPOOL_PRINTERR - if ( used_memory != total_size ) { - printf( "\n** MemoryPool::MemoryPool() USED_MEMORY(%lu) != TOTAL_SIZE(%lu) **\n", - used_memory, total_size ); + // Assumption. A superblock's size can be at most 2^31. Verify this. + if ( m_lg_sb_size > 31 ) { + printf( "\n** MemoryPool::MemoryPool() Superblock size must be < %u **\n", + ( uint32_t(1) << 31 ) ); #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST fflush( stdout ); #endif Kokkos::abort( "" ); } + + // Assumption. The Bitset only uses unsigned for size types which limits + // the amount of memory the allocator can manage. Verify the memory size + // is below this limit. + if ( m_data_size > size_t(MIN_BLOCK_SIZE) * std::numeric_limits<unsigned>::max() ) { + printf( "\n** MemoryPool::MemoryPool() Allocator can only manage %lu bytes of memory; requested %lu **\n", + size_t(MIN_BLOCK_SIZE) * std::numeric_limits<unsigned>::max(), total_size ); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + fflush( stdout ); #endif + Kokkos::abort( "" ); + } + + // Allocate memory for Views. This is done here instead of at construction + // so that the runtime checks can be performed before allocating memory. + resize(m_active, m_num_block_size ); + resize(m_sb_header, m_num_sb ); - // Create the chunks for each freelist. - for ( size_t i = 0; i < num_chunk_sizes; ++i ) { - // Initialize the next pointers to point to the next chunk for all but the - // last chunk which uses a reserved value to indicate the end of the list. 
- initialize_mempool<MemPoolList> im( m_data, m_chunk_size[i], num_chunks[i] - 1, - get_head_offset( m_freelist[i] ) ); + // Allocate superblock memory. + typedef Impl::SharedAllocationRecord< backend_memory_space, void > SharedRecord; + SharedRecord * rec = + SharedRecord::allocate( memspace, "mempool", m_total_size ); - Kokkos::Impl::ParallelFor< initialize_mempool<MemPoolList>, Range > - closure( im, Range( 0, num_chunks[i] ) ); + m_track.assign_allocated_record_to_uninitialized( rec ); + m_data = reinterpret_cast<char *>( rec->data() ); - closure.execute(); + // Set and initialize the free / empty block bitset memory. + m_sb_blocks.init( m_data + m_data_size, m_num_sb << m_lg_max_sb_blocks ); - ExecutionSpace::fence(); - } + // Set and initialize the empty superblock block bitset memory. + m_empty_sb.init( m_data + m_data_size + m_sb_blocks_size, m_num_sb ); -#ifdef KOKKOS_MEMPOOL_PRINT_INFO - print_mempool<MemPoolList> pm( num_chunk_sizes, m_chunk_size, m_freelist, m_data ); + // Start with all superblocks in the empty category. + m_empty_sb.set(); + + // Set and initialize the partfull superblock block bitset memory. + m_partfull_sb.init( m_data + m_data_size + m_sb_blocks_size + m_empty_sb_size, + m_ceil_num_sb * m_num_block_size ); + + // Initialize all active superblocks to be invalid. + typename UInt32View::HostMirror host_active = create_mirror_view(m_active); + for (size_t i = 0; i < m_num_block_size; ++i) host_active(i) = INVALID_SUPERBLOCK; + + deep_copy(m_active, host_active); + + // Initialize the blocksize info. + for ( size_t i = 0; i < m_num_block_size; ++i ) { + uint32_t lg_block_size = i + LG_MIN_BLOCK_SIZE; + uint32_t blocks_per_sb = m_sb_size >> lg_block_size; + uint32_t pages_per_sb = ( blocks_per_sb + BLOCKS_PER_PAGE - 1 ) >> LG_BLOCKS_PER_PAGE; + + m_blocksize_info[i].m_blocks_per_sb = blocks_per_sb; + m_blocksize_info[i].m_pages_per_sb = pages_per_sb; + + // Set the full level for the superblock. 
+ m_blocksize_info[i].m_sb_full_level = + static_cast<uint32_t>( pages_per_sb * KOKKOS_MEMPOOL_SB_FULL_FRACTION ); + + if ( m_blocksize_info[i].m_sb_full_level == 0 ) { + m_blocksize_info[i].m_sb_full_level = 1; + } - Kokkos::Impl::ParallelFor< print_mempool<MemPoolList>, Range > - closure( pm, Range( 0, 10 ) ); + // Set the full level for the page. + uint32_t blocks_per_page = + blocks_per_sb < BLOCKS_PER_PAGE ? blocks_per_sb : BLOCKS_PER_PAGE; - closure.execute(); + m_blocksize_info[i].m_page_full_level = + static_cast<uint32_t>( blocks_per_page * KOKKOS_MEMPOOL_PAGE_FULL_FRACTION ); - ExecutionSpace::fence(); + if ( m_blocksize_info[i].m_page_full_level == 0 ) { + m_blocksize_info[i].m_page_full_level = 1; + } + } + +#ifdef KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO + printf( "\n" ); + printf( " m_lg_sb_size: %12lu\n", m_lg_sb_size ); + printf( " m_sb_size: %12lu\n", m_sb_size ); + printf( " m_max_sb_blocks: %12lu\n", size_t(1) << m_lg_max_sb_blocks ); + printf( "m_lg_max_sb_blocks: %12lu\n", m_lg_max_sb_blocks ); + printf( " m_num_sb: %12lu\n", m_num_sb ); + printf( " m_ceil_num_sb: %12lu\n", m_ceil_num_sb ); + printf( " m_num_block_size: %12lu\n", m_num_block_size ); + printf( " data bytes: %12lu\n", m_data_size ); + printf( " sb_blocks bytes: %12lu\n", m_sb_blocks_size ); + printf( " empty_sb bytes: %12lu\n", m_empty_sb_size ); + printf( " partfull_sb bytes: %12lu\n", m_partfull_sb_size ); + printf( " total bytes: %12lu\n", m_total_size ); + printf( " m_empty_sb size: %12u\n", m_empty_sb.size() ); + printf( "m_partfull_sb size: %12u\n", m_partfull_sb.size() ); + printf( "\n" ); + fflush( stdout ); #endif - } - /// \brief Releases a lock on a freelist. - KOKKOS_FUNCTION - uint64_t acquire_lock( volatile uint64_t * freelist ) const; +#ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO + // Print the blocksize info for all the block sizes. 
+ printf( "SIZE BLOCKS_PER_SB PAGES_PER_SB SB_FULL_LEVEL PAGE_FULL_LEVEL\n" ); + for ( size_t i = 0; i < m_num_block_size; ++i ) { + printf( "%4zu %13u %12u %13u %15u\n", i + LG_MIN_BLOCK_SIZE, + m_blocksize_info[i].m_blocks_per_sb, m_blocksize_info[i].m_pages_per_sb, + m_blocksize_info[i].m_sb_full_level, m_blocksize_info[i].m_page_full_level ); + } + printf( "\n" ); +#endif + } - /// \brief Releases a lock on a freelist. + /// \brief The actual block size allocated given alloc_size. + KOKKOS_INLINE_FUNCTION + size_t allocate_block_size( const size_t alloc_size ) const + { return size_t(1) << ( get_block_size_index( alloc_size ) + LG_MIN_BLOCK_SIZE); } + + /// \brief Allocate a chunk of memory. + /// \param alloc_size Size of the requested allocated in number of bytes. + /// + /// The function returns a void pointer to a memory location on success and + /// NULL on failure. KOKKOS_FUNCTION - void release_lock( volatile uint64_t * freelist, uint64_t new_head ) const; + void * allocate( size_t alloc_size ) const + { + void * p = 0; - /// \brief Tries to refill a freelist using a chunk from another freelist. - KOKKOS_FUNCTION - void * refill_freelist( size_t l_exp ) const; + // Only support allocations up to the superblock size. Just return 0 + // (failed allocation) for any size above this. + if (alloc_size <= m_sb_size ) + { + int block_size_id = get_block_size_index( alloc_size ); + uint32_t blocks_per_sb = m_blocksize_info[block_size_id].m_blocks_per_sb; + uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb; + unsigned word_size = blocks_per_sb > 32 ? 32 : blocks_per_sb; + unsigned word_mask = ( uint64_t(1) << word_size ) - 1; - /// \brief Claim chunks of untracked memory from the pool. - KOKKOS_FUNCTION - void * allocate( size_t alloc_size ) const; + uint32_t sb_id = volatile_load( &m_active(block_size_id) ); - /// \brief Release claimed memory back into the pool. 
- KOKKOS_FUNCTION - void deallocate( void * alloc_ptr, size_t alloc_size ) const; + // If the active is locked, keep reading it until the lock is released. + while ( sb_id == SUPERBLOCK_LOCK ) { + sb_id = volatile_load( &m_active(block_size_id) ); + } - // \brief Pulls the offset from a freelist head. - KOKKOS_INLINE_FUNCTION - uint64_t get_head_offset(uint64_t head) const - { return ( head >> TAGBITS ) << LG_MIN_CHUNKSIZE; } + bool allocation_done = false; + + while (!allocation_done) { + bool need_new_sb = false; + + if (sb_id != INVALID_SUPERBLOCK) { + // Use the value from the clock register as the hash value. + uint64_t hash_val = get_clock_register(); + + // Get the starting position for this superblock's bits in the bitset. + uint32_t pos_base = sb_id << m_lg_max_sb_blocks; + + // Mod the hash value to choose a page in the superblock. The + // initial block searched is the first block of that page. + uint32_t pos_rel = uint32_t( hash_val & ( pages_per_sb - 1 ) ) << LG_BLOCKS_PER_PAGE; + + // Get the absolute starting position for this superblock's bits in the bitset. + uint32_t pos = pos_base + pos_rel; + + // Keep track of the number of pages searched. Pages in the superblock are + // searched linearly from the starting page. All pages in the superblock are + // searched until either a location is found, or it is proven empty. + uint32_t pages_searched = 0; + + bool search_done = false; + + while (!search_done) { + bool success; + unsigned prev_val; + + Kokkos::tie( success, pos ) = + m_sb_blocks.set_any_in_word( pos, prev_val, word_mask ); + + if ( !success ) { + if ( ++pages_searched >= pages_per_sb ) { + // Searched all the pages in this superblock. Look for a new superblock. + // + // The previous method tried limiting the number of pages searched, but + // that caused a huge performance issue in CUDA where the outer loop + // executed massive numbers of times. 
Threads weren't able to find a + free location when the superblock wasn't full and were able to execute + the outer loop many times before the superblock was switched for a new + one. Switching to an exhaustive search eliminated this possibility and + didn't slow anything down for the tests. + need_new_sb = true; + search_done = true; + } + else { + // Move to the next page making sure the new search position + // doesn't go past this superblock's bits. + pos += BLOCKS_PER_PAGE; + pos = ( pos < pos_base + blocks_per_sb ) ? pos : pos_base; + } + } + else { + // Reserved a memory location to allocate. + search_done = true; + allocation_done = true; + + uint32_t lg_block_size = block_size_id + LG_MIN_BLOCK_SIZE; + + p = m_data + ( size_t(sb_id) << m_lg_sb_size ) + + ( ( pos - pos_base ) << lg_block_size ); + + uint32_t used_bits = Kokkos::Impl::bit_count( prev_val ); + + if ( used_bits == 0 ) { + // This page was empty. Decrement the number of empty pages for + // the superblock. + atomic_fetch_sub( &m_sb_header(sb_id).m_empty_pages, 1 ); + } + else if ( used_bits == m_blocksize_info[block_size_id].m_page_full_level - 1 ) + { + // This page is full. Increment the number of full pages for + // the superblock. + uint32_t full_pages = atomic_fetch_add( &m_sb_header(sb_id).m_full_pages, 1 ); + + // This allocation made the superblock full, so a new one needs to be found. + if ( full_pages == m_blocksize_info[block_size_id].m_sb_full_level - 1 ) { + need_new_sb = true; + } + } + } + } + } + else { + // This is the first allocation for this block size. A superblock needs + // to be set as the active one. If this point is reached any other time, + // it is an error. + need_new_sb = true; + } + + if ( need_new_sb ) { + uint32_t new_sb_id = find_superblock( block_size_id, sb_id ); + + if ( new_sb_id == sb_id ) { + allocation_done = true; +#ifdef KOKKOS_MEMPOOL_PRINT_INFO + printf( "** No superblocks available. 
**\n" ); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + fflush( stdout ); +#endif +#endif + } + else { + sb_id = new_sb_id; + } + } + } + } +#ifdef KOKKOS_MEMPOOL_PRINT_INFO + else { + printf( "** Requested allocation size (%zu) larger than superblock size (%lu). **\n", + alloc_size, m_sb_size); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + fflush( stdout ); +#endif + } +#endif - // \brief Pulls the tag from a freelist head. - KOKKOS_INLINE_FUNCTION - uint64_t get_head_tag(uint64_t head) const { return head & TAGBITS_MASK; } - // \brief Creates a freelist head from a offset and tag. - KOKKOS_INLINE_FUNCTION - uint64_t create_head(uint64_t offset, uint64_t tag) const - { return ( ( offset >> LG_MIN_CHUNKSIZE ) << TAGBITS ) | tag; } + return p; + } - // \brief Increments a tag. - KOKKOS_INLINE_FUNCTION - uint64_t increment_tag(uint64_t tag) const { return ( tag + 1 ) & TAGBITS_MASK; } + /// \brief Release allocated memory back to the pool. + /// \param alloc_ptr Pointer to chunk of memory previously allocated by + /// the allocator. + /// \param alloc_size Size of the allocated memory in number of bytes. + KOKKOS_FUNCTION + void deallocate( void * alloc_ptr, size_t alloc_size ) const + { + char * ap = static_cast<char *>( alloc_ptr ); + + // Only deallocate memory controlled by this pool. + if ( ap >= m_data && ap + alloc_size <= m_data + m_data_size ) { + // Get the superblock for the address. This can be calculated by math on + // the address since the superblocks are stored contiguously in one memory + // chunk. + uint32_t sb_id = ( ap - m_data ) >> m_lg_sb_size; + + // Get the starting position for this superblock's bits in the bitset. + uint32_t pos_base = sb_id << m_lg_max_sb_blocks; + + // Get the relative position for this memory location's bit in the bitset. 
+ uint32_t offset = ( ap - m_data ) - ( size_t(sb_id) << m_lg_sb_size ); + uint32_t lg_block_size = m_sb_header(sb_id).m_lg_block_size; + uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE; + uint32_t pos_rel = offset >> lg_block_size; + + bool success; + unsigned prev_val; + + Kokkos::tie( success, prev_val ) = m_sb_blocks.fetch_word_reset( pos_base + pos_rel ); + + // If the memory location was previously deallocated, do nothing. + if ( success ) { + uint32_t page_fill_level = Kokkos::Impl::bit_count( prev_val ); + + if ( page_fill_level == 1 ) { + // This page is now empty. Increment the number of empty pages for the + // superblock. + uint32_t empty_pages = atomic_fetch_add( &m_sb_header(sb_id).m_empty_pages, 1 ); + + if ( !volatile_load( &m_sb_header(sb_id).m_is_active ) && + empty_pages == m_blocksize_info[block_size_id].m_pages_per_sb - 1 ) + { + // This deallocation caused the superblock to be empty. Change the + // superblock category from partially full to empty. + unsigned pos = block_size_id * m_ceil_num_sb + sb_id; + + if ( m_partfull_sb.reset( pos ) ) { + // Reset the empty pages and block size for the superblock. + volatile_store( &m_sb_header(sb_id).m_empty_pages, uint32_t(0) ); + volatile_store( &m_sb_header(sb_id).m_lg_block_size, uint32_t(0) ); + + memory_fence(); + + m_empty_sb.set( sb_id ); + } + } + } + else if ( page_fill_level == m_blocksize_info[block_size_id].m_page_full_level ) { + // This page is no longer full. Decrement the number of full pages for + // the superblock. + uint32_t full_pages = atomic_fetch_sub( &m_sb_header(sb_id).m_full_pages, 1 ); + + if ( !volatile_load( &m_sb_header(sb_id).m_is_active ) && + full_pages == m_blocksize_info[block_size_id].m_sb_full_level ) + { + // This deallocation caused the number of full pages to decrease below + // the full threshold. Change the superblock category from full to + // partially full. 
+ unsigned pos = block_size_id * m_ceil_num_sb + sb_id; + m_partfull_sb.set( pos ); + } + } + } + } +#ifdef KOKKOS_MEMPOOL_PRINTERR + else { + printf( "\n** MemoryPool::deallocate() ADDRESS_OUT_OF_RANGE(0x%llx) **\n", + reinterpret_cast<uint64_t>( alloc_ptr ) ); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + fflush( stdout ); +#endif + } +#endif + } - /// \brief Tests if the memory pool is empty. + /// \brief Tests if the memory pool has no more memory available to allocate. KOKKOS_INLINE_FUNCTION bool is_empty() const { - size_t l = 0; - while ( m_chunk_size[l] > 0 && - get_head_offset( m_freelist[l] ) == FREELIST_END ) - { - ++l; + // The allocator is empty if all superblocks are full. A superblock is + // full if it has >= 80% of its pages allocated. + + // Look at all the superblocks. If one is not full, then the allocator + // isn't empty. + for ( size_t i = 0; i < m_num_sb; ++i ) { + uint32_t lg_block_size = m_sb_header(i).m_lg_block_size; + + // A superblock only has a block size of 0 when it is empty. + if ( lg_block_size == 0 ) return false; + + uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE; + uint32_t full_pages = volatile_load( &m_sb_header(i).m_full_pages ); + + if ( full_pages < m_blocksize_info[block_size_id].m_sb_full_level ) return false; } - return m_chunk_size[l] == 0; + // All the superblocks were full. The allocator is empty. + return true; } // The following functions are used for debugging. 
void print_status() const { - for ( size_t l = 0; m_chunk_size[l] > 0; ++l ) { - size_t count = 0; - uint64_t chunk = get_head_offset( m_freelist[l] ); + printf( "\n" ); - while ( chunk != FREELIST_END ) { - ++count; - chunk = *reinterpret_cast<uint64_t *>( m_data + chunk ); - } +#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO + typename SBHeaderView::HostMirror host_sb_header = create_mirror_view(m_sb_header); + deep_copy( host_sb_header, m_sb_header ); + + UInt32View num_allocated_blocks( "Allocated Blocks", m_num_sb ); - printf( "chunk_size: %6lu num_chunks: %8lu\n", m_chunk_size[l], count ); - fflush(stdout); + // Count the number of allocated blocks per superblock. + { + MempoolImpl::count_allocated_blocks< UInt32View, SBHeaderView, MempoolBitset > + mch( m_num_sb, num_allocated_blocks, m_sb_header, + m_sb_blocks, m_sb_size, m_lg_max_sb_blocks ); } - } - KOKKOS_INLINE_FUNCTION - size_t get_min_chunk_size() const { return m_chunk_size[0]; } + typename UInt32View::HostMirror host_num_allocated_blocks = + create_mirror_view(num_allocated_blocks); + deep_copy( host_num_allocated_blocks, num_allocated_blocks ); + + // Print header info of all superblocks. + printf( "SB_ID SIZE ACTIVE EMPTY_PAGES FULL_PAGES USED_BLOCKS\n" ); + for ( size_t i = 0; i < m_num_sb; ++i ) { + printf( "%5zu %4u %6d %11u %10u %10u\n", i, + host_sb_header(i).m_lg_block_size, host_sb_header(i).m_is_active, + host_sb_header(i).m_empty_pages, host_sb_header(i).m_full_pages, + host_num_allocated_blocks(i) ); + } - size_t get_mem_size() const { return m_data_size; } -}; + printf( "\n" ); +#endif -} // namespace Impl -} // namespace Experimental -} // namespace Kokkos + UInt32View page_histogram( "Page Histogram", 33 ); -//---------------------------------------------------------------------------- -/* Prefer to implement these functions in a separate - * compilation unit. 
For CUDA this requires nvcc command - * --relocatable-device-code=true - * When this command is set then the macro - * KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE - * is also set. - */ -#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && \ - ! defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) - -#include <impl/Kokkos_MemoryPool_Inline.hpp> + // Get a View version of the blocksize info. + typedef View< BlockSizeHeader *, device_type > BSHeaderView; + BSHeaderView blocksize_info( "BlockSize Headers", MAX_BLOCK_SIZES ); -#endif + Kokkos::Impl::DeepCopy< backend_memory_space, Kokkos::HostSpace > + dc( blocksize_info.ptr_on_device(), m_blocksize_info, + sizeof(BlockSizeHeader) * m_num_block_size ); -//---------------------------------------------------------------------------- + Kokkos::pair< double, uint32_t > result = Kokkos::pair< double, uint32_t >( 0.0, 0 ); -namespace Kokkos { -namespace Experimental { + // Create the page histogram. + { + MempoolImpl::create_histogram< UInt32View, BSHeaderView, SBHeaderView, MempoolBitset > + mch( 0, m_num_sb, page_histogram, blocksize_info, m_sb_header, m_sb_blocks, + m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result ); + } -/// \class MemoryPool -/// \brief Memory management for pool of same-sized chunks of memory. -/// -/// MemoryPool is a memory space that can be on host or device. It provides a -/// pool memory allocator for fast allocation of same-sized chunks of memory. -/// The memory is only accessible on the host / device this allocator is -/// associated with. -template < typename Space , typename ExecSpace > -class MemoryPool { -private: + typename UInt32View::HostMirror host_page_histogram = create_mirror_view(page_histogram); + deep_copy( host_page_histogram, page_histogram ); - Impl::MemPoolList m_memory; + // Find the used and total pages and blocks. 
+ uint32_t used_pages = 0; + uint32_t used_blocks = 0; + for ( uint32_t i = 1; i < 33; ++i ) { + used_pages += host_page_histogram(i); + used_blocks += i * host_page_histogram(i); + } + uint32_t total_pages = used_pages + host_page_histogram(0); + + unsigned num_empty_sb = m_empty_sb.count(); + unsigned num_non_empty_sb = m_num_sb - num_empty_sb; + unsigned num_partfull_sb = m_partfull_sb.count(); - typedef ExecSpace execution_space; - typedef typename Space::memory_space backend_memory_space; + uint32_t total_blocks = result.second; + double ave_sb_full = num_non_empty_sb == 0 ? 0.0 : result.first / num_non_empty_sb; + double percent_used_sb = double( m_num_sb - num_empty_sb ) / m_num_sb; + double percent_used_pages = total_pages == 0 ? 0.0 : double(used_pages) / total_pages; + double percent_used_blocks = total_blocks == 0 ? 0.0 : double(used_blocks) / total_blocks; -#if defined( KOKKOS_HAVE_CUDA ) + // Count active superblocks. + typename UInt32View::HostMirror host_active = create_mirror_view(m_active); + deep_copy(host_active, m_active); - // Current implementation requires CudaUVM memory space - // for Cuda memory pool. + unsigned num_active_sb = 0; + for ( size_t i = 0; i < m_num_block_size; ++i ) { + num_active_sb += host_active(i) != INVALID_SUPERBLOCK; + } - static_assert( - ! std::is_same< typename Space::memory_space , Kokkos::CudaSpace >::value , - "Kokkos::MemoryPool currently cannot use Kokkos::CudaSpace, you must use Kokkos::CudaUVMSpace" ); +#ifdef KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS + // Print active superblocks. 
+ printf( "BS_ID SB_ID\n" ); + for ( size_t i = 0; i < m_num_block_size; ++i ) { + uint32_t sb_id = host_active(i); + if ( sb_id == INVALID_SUPERBLOCK ) { + printf( "%5zu I\n", i ); + } + else if ( sb_id == SUPERBLOCK_LOCK ) { + printf( "%5zu L\n", i ); + } + else { + printf( "%5zu %7u\n", i, sb_id ); + } + } + printf( "\n" ); + fflush( stdout ); #endif -public: +#ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO + // Print the summary page histogram. + printf( "USED_BLOCKS PAGE_COUNT\n" ); + for ( uint32_t i = 0; i < 33; ++i ) { + printf( "%10u %10u\n", i, host_page_histogram[i] ); + } + printf( "\n" ); +#endif - //! Tag this class as a kokkos memory space - typedef MemoryPool memory_space; +#ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO + // Print the page histogram for a few individual superblocks. +// const uint32_t num_sb_id = 2; +// uint32_t sb_id[num_sb_id] = { 0, 10 }; + const uint32_t num_sb_id = 1; + uint32_t sb_id[num_sb_id] = { 0 }; - //------------------------------------ + for ( uint32_t i = 0; i < num_sb_id; ++i ) { + deep_copy( page_histogram, 0 ); - MemoryPool() = default; - MemoryPool( MemoryPool && rhs ) = default; - MemoryPool( const MemoryPool & rhs ) = default; - MemoryPool & operator = ( MemoryPool && ) = default; - MemoryPool & operator = ( const MemoryPool & ) = default; - ~MemoryPool() = default; + { + MempoolImpl::create_histogram< UInt32View, BSHeaderView, SBHeaderView, MempoolBitset > + mch( sb_id[i], sb_id[i] + 1, page_histogram, blocksize_info, m_sb_header, + m_sb_blocks, m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result ); + } - /// \brief Allocate memory pool - /// \param memspace From where to allocate the pool. - /// \param base_chunk_size Hand out memory in chunks of this size. - /// \param total_size Total size of the pool. 
- MemoryPool( const backend_memory_space & memspace, - size_t base_chunk_size, size_t total_size, - size_t num_chunk_sizes = 4, size_t chunk_spacing = 4 ) - : m_memory( memspace, execution_space(), base_chunk_size, total_size, - num_chunk_sizes, chunk_spacing ) - {} - - /// \brief Claim chunks of untracked memory from the pool. - /// Can only be called from device. - KOKKOS_INLINE_FUNCTION - void * allocate( const size_t alloc_size ) const - { return m_memory.allocate( alloc_size ); } + deep_copy( host_page_histogram, page_histogram ); - /// \brief Release claimed memory back into the pool - /// Can only be called from device. - KOKKOS_INLINE_FUNCTION - void deallocate( void * const alloc_ptr, const size_t alloc_size ) const - { m_memory.deallocate( alloc_ptr, alloc_size ); } + printf( "SB_ID USED_BLOCKS PAGE_COUNT\n" ); + for ( uint32_t j = 0; j < 33; ++j ) { + printf( "%5u %10u %10u\n", sb_id[i], j, host_page_histogram[j] ); + } + printf( "\n" ); + } - /// \brief Is out of memory at this instant - KOKKOS_INLINE_FUNCTION - bool is_empty() const { return m_memory.is_empty(); } +/* + // Print the blocks used for each page of a few individual superblocks. 
+ for ( uint32_t i = 0; i < num_sb_id; ++i ) { + uint32_t lg_block_size = host_sb_header(sb_id[i]).m_lg_block_size; + if ( lg_block_size != 0 ) { + printf( "SB_ID BLOCK ID USED_BLOCKS\n" ); + + uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE; + uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb; + + for ( uint32_t j = 0; j < pages_per_sb; ++j ) { + unsigned start_pos = ( sb_id[i] << m_lg_max_sb_blocks ) + j * BLOCKS_PER_PAGE; + unsigned end_pos = start_pos + BLOCKS_PER_PAGE; + uint32_t num_allocated_blocks = 0; + + for ( unsigned k = start_pos; k < end_pos; ++k ) { + num_allocated_blocks += m_sb_blocks.test( k ); + } + + printf( "%5u %8u %11u\n", sb_id[i], j, num_allocated_blocks ); + } + + printf( "\n" ); + } + } +*/ +#endif + + printf( " Used blocks: %10u / %10u = %10.6lf\n", used_blocks, total_blocks, + percent_used_blocks ); + printf( " Used pages: %10u / %10u = %10.6lf\n", used_pages, total_pages, + percent_used_pages ); + printf( " Used SB: %10zu / %10zu = %10.6lf\n", m_num_sb - num_empty_sb, m_num_sb, + percent_used_sb ); + printf( " Active SB: %10u\n", num_active_sb ); + printf( " Empty SB: %10u\n", num_empty_sb ); + printf( " Partfull SB: %10u\n", num_partfull_sb ); + printf( " Full SB: %10lu\n", + m_num_sb - num_active_sb - num_empty_sb - num_partfull_sb ); + printf( "Ave. SB Full %%: %10.6lf\n", ave_sb_full ); + printf( "\n" ); + fflush( stdout ); + +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + fflush( stdout ); +#endif + } - /// \brief Minimum chunk size allocatable. KOKKOS_INLINE_FUNCTION - size_t get_min_chunk_size() const { return m_memory.get_min_chunk_size(); } + size_t get_min_block_size() const { return MIN_BLOCK_SIZE; } - // The following functions are used for debugging. 
- void print_status() const { m_memory.print_status(); } - size_t get_mem_size() const { return m_memory.get_mem_size(); } + size_t get_mem_size() const { return m_data_size; } + +private: + /// \brief Returns the index into the active array for the given size. + /// + /// Computes log2 of the largest power of two >= the given size + /// ( ie ceil( log2(size) ) ) shifted by LG_MIN_BLOCK_SIZE. + KOKKOS_FORCEINLINE_FUNCTION + int get_block_size_index( const size_t size ) const + { + // We know the size fits in a 32 bit unsigned because the size of a + // superblock is limited to 2^31, so casting to an unsigned is safe. + + // Find the most significant nonzero bit. + uint32_t first_nonzero_bit = + Kokkos::Impl::bit_scan_reverse( static_cast<unsigned>( size ) ); + + // If size is an integral power of 2, ceil( log2(size) ) is equal to the + // most significant nonzero bit. Otherwise, you need to add 1. Since the + // minimum block size is MIN_BLOCK_SIZE, make sure ceil( log2(size) ) is at + // least LG_MIN_BLOCK_SIZE. + uint32_t lg2_size = first_nonzero_bit + !Kokkos::Impl::is_integral_power_of_two( size ); + lg2_size = lg2_size > LG_MIN_BLOCK_SIZE ? lg2_size : LG_MIN_BLOCK_SIZE; + + // Return ceil( log2(size) ) shifted so that the value for MIN_BLOCK_SIZE + // is 0. + return lg2_size - LG_MIN_BLOCK_SIZE; + } + + /// \brief Finds a superblock with free space to become a new active superblock. + /// + /// If this function is called, the current active superblock needs to be replaced + /// because it is full. Initially, only the thread that sets the active superblock + /// to full calls this function. Other threads can still allocate from the "full" + /// active superblock because a full superblock still has locations available. If + /// a thread tries to allocate from the active superblock when it has no free + /// locations, then that thread will call this function, too, and spin on a lock + /// waiting until the active superblock has been replaced. 
+ KOKKOS_FUNCTION + uint32_t find_superblock( int block_size_id, uint32_t old_sb ) const + { + // Try to grab the lock on the head. + uint32_t lock_sb = + Kokkos::atomic_compare_exchange( &m_active(block_size_id), old_sb, SUPERBLOCK_LOCK ); + + // Initialize the new superblock to be the previous one so the previous + // superblock is returned if a new superblock can't be found. + uint32_t new_sb = lock_sb; + + if ( lock_sb == old_sb ) { + // This thread has the lock. + + // 1. Look for a partially filled superblock that is of the right block + // size. + + size_t max_tries = m_ceil_num_sb >> LG_BLOCKS_PER_PAGE; + size_t tries = 0; + bool search_done = false; + + // Set the starting search position to the beginning of this block + // size's bitset. + unsigned pos = block_size_id * m_ceil_num_sb; + + while (!search_done) { + bool success = false; + unsigned prev_val; + + Kokkos::tie( success, pos ) = m_partfull_sb.reset_any_in_word( pos, prev_val ); + + if ( !success ) { + if ( ++tries >= max_tries ) { + // Exceeded number of words for this block size's bitset. + search_done = true; + } + else { + pos += BLOCKS_PER_PAGE; + } + } + else { + // Found a superblock. + search_done = true; + new_sb = pos - block_size_id * m_ceil_num_sb; + + // Assertions: + // 1. A different superblock than the current should be found. +#ifdef KOKKOS_MEMPOOL_PRINTERR + if ( new_sb == lock_sb ) { + printf( "\n** MemoryPool::find_superblock() FOUND_SAME_SUPERBLOCK: %u **\n", + new_sb); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + fflush( stdout ); +#endif + Kokkos::abort( "" ); + } +#endif + + // Set the head status for the superblock. + volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(true) ); + + // If there was a previous active superblock, mark it as not active. + // It is now in the full category and as such isn't tracked. 
+ if ( lock_sb != INVALID_SUPERBLOCK ) { + volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(false) ); + } + + memory_fence(); + } + } + + // 2. Look for an empty superblock. + if ( new_sb == lock_sb ) { + tries = 0; + search_done = false; + + // Set the starting search position to the beginning of this block + // size's bitset. + pos = 0; + + while (!search_done) { + bool success = false; + unsigned prev_val; + + Kokkos::tie( success, pos ) = m_empty_sb.reset_any_in_word( pos, prev_val ); + + if ( !success ) { + if ( ++tries >= max_tries ) { + // Exceeded number of words for this block size's bitset. + search_done = true; + } + else { + pos += BLOCKS_PER_PAGE; + } + } + else { + // Found a superblock. + search_done = true; + new_sb = pos; + + // Assertions: + // 1. A different superblock than the current should be found. +#ifdef KOKKOS_MEMPOOL_PRINTERR + if ( new_sb == lock_sb ) { + printf( "\n** MemoryPool::find_superblock() FOUND_SAME_SUPERBLOCK: %u **\n", + new_sb); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + fflush( stdout ); +#endif + Kokkos::abort( "" ); + } +#endif + + // Set the empty pages, block size, and head status for the + // superblock. + volatile_store( &m_sb_header(new_sb).m_empty_pages, + m_blocksize_info[block_size_id].m_pages_per_sb ); + volatile_store( &m_sb_header(new_sb).m_lg_block_size, + block_size_id + LG_MIN_BLOCK_SIZE ); + volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(true) ); + + // If there was a previous active superblock, mark it as not active. + // It is now in the full category and as such isn't tracked. + if ( lock_sb != INVALID_SUPERBLOCK ) { + volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(false) ); + } + + memory_fence(); + } + } + } + + // Write the new active superblock to release the lock. 
+ atomic_exchange( &m_active(block_size_id), new_sb ); + } + else { + // Either another thread has the lock and is switching the active superblock for + // this block size or another thread has already changed the active superblock + // since this thread read its value. Keep reading the active superblock until + // it isn't locked to get the new active superblock. + do { + new_sb = volatile_load( &m_active(block_size_id) ); + } while ( new_sb == SUPERBLOCK_LOCK ); + + // Assertions: + // 1. An invalid superblock should never be found here. + // 2. If the new superblock is the same as the previous superblock, the + // allocator is empty. +#ifdef KOKKOS_MEMPOOL_PRINTERR + if ( new_sb == INVALID_SUPERBLOCK ) { + printf( "\n** MemoryPool::find_superblock() FOUND_INACTIVE_SUPERBLOCK **\n" ); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + fflush( stdout ); +#endif + Kokkos::abort( "" ); + } +#endif + } + + return new_sb; + } + + /// Returns 64 bits from a clock register. + KOKKOS_FORCEINLINE_FUNCTION + uint64_t get_clock_register(void) const + { +#if defined( __CUDA_ARCH__ ) + // Return value of 64-bit hi-res clock register. + return clock64(); +#elif defined( __i386__ ) || defined( __x86_64 ) + // Return value of 64-bit hi-res clock register. 
+ unsigned a, d; + __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); + return ( (uint64_t) a) | ( ( (uint64_t) d ) << 32 ); +#else + const uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + return ticks; +#endif + } }; } // namespace Experimental @@ -583,4 +1501,23 @@ public: #undef KOKKOS_MEMPOOL_PRINT_INFO #endif -#endif /* #define KOKKOS_MEMORYPOOL_HPP */ +#ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO +#undef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO +#endif + +#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO +#undef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO +#endif + +#ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO +#undef KOKKOS_MEMPOOL_PRINT_PAGE_INFO +#endif + +#ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO +#undef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO +#endif + +#undef KOKKOS_MEMPOOL_SB_FULL_FRACTION +#undef KOKKOS_MEMPOOL_PAGE_FULL_FRACTION + +#endif // KOKKOS_MEMORYPOOL_HPP diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp index 389ee4b2fd..7be4f8245f 100644 --- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp +++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -58,9 +58,11 @@ #endif #include <Kokkos_ScratchSpace.hpp> #include <Kokkos_Parallel.hpp> +#include <Kokkos_TaskPolicy.hpp> #include <Kokkos_Layout.hpp> #include <impl/Kokkos_Tags.hpp> +#include <KokkosExp_MDRangePolicy.hpp> /*--------------------------------------------------------------------------*/ namespace Kokkos { @@ -177,6 +179,7 @@ struct VerifyExecutionCanAccessMemorySpace #include <OpenMP/Kokkos_OpenMPexec.hpp> #include <OpenMP/Kokkos_OpenMP_Parallel.hpp> +#include <OpenMP/Kokkos_OpenMP_Task.hpp> /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp index 7e906a4571..83436826f4 100644 --- a/lib/kokkos/core/src/Kokkos_Pair.hpp +++ b/lib/kokkos/core/src/Kokkos_Pair.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -35,7 +35,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER @@ -125,17 +125,26 @@ struct pair return *this; } - /// \brief Assignment operator. + + /// \brief Assignment operator, for volatile <tt>*this</tt>. /// - /// This calls the assignment operators of T1 and T2. It won't + /// \param p [in] Input; right-hand side of the assignment. + /// + /// This calls the assignment operators of T1 and T2. 
It will not /// compile if the assignment operators are not defined and public. + /// + /// This operator returns \c void instead of <tt>volatile pair<T1, + /// T2>& </tt>. See Kokkos Issue #177 for the explanation. In + /// practice, this means that you should not chain assignments with + /// volatile lvalues. template <class U, class V> KOKKOS_FORCEINLINE_FUNCTION - volatile pair<T1, T2> & operator=(const volatile pair<U,V> &p) volatile + void operator=(const volatile pair<U,V> &p) volatile { first = p.first; second = p.second; - return *this; + // We deliberately do not return anything here. See explanation + // in public documentation above. } // from std::pair<U,V> diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp index edaced22a9..588dc90af3 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp @@ -57,7 +57,6 @@ #include <typeinfo> #endif -#include <impl/Kokkos_AllocationTracker.hpp> #include <impl/Kokkos_Tags.hpp> #include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_FunctorAdapter.hpp> @@ -178,8 +177,8 @@ void parallel_for( const ExecPolicy & policy { #if (KOKKOS_ENABLE_PROFILING) uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelFor("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); } #endif @@ -190,8 +189,8 @@ void parallel_for( const ExecPolicy & policy closure.execute(); #if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelFor(kpID); + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } #endif } @@ -210,8 +209,8 @@ void parallel_for( const size_t work_count #if (KOKKOS_ENABLE_PROFILING) uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID); } #endif @@ -222,8 +221,8 @@ void parallel_for( const size_t work_count closure.execute(); #if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelFor(kpID); + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } #endif } @@ -248,405 +247,9 @@ void parallel_for( const std::string & str (void) str; } -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -/** \brief Parallel reduction - * - * Example of a parallel_reduce functor for a POD (plain old data) value type: - * \code - * class FunctorType { // For POD value type - * public: - * typedef ... 
execution_space ; - * typedef <podType> value_type ; - * void operator()( <intType> iwork , <podType> & update ) const ; - * void init( <podType> & update ) const ; - * void join( volatile <podType> & update , - * volatile const <podType> & input ) const ; - * - * typedef true_type has_final ; - * void final( <podType> & update ) const ; - * }; - * \endcode - * - * Example of a parallel_reduce functor for an array of POD (plain old data) values: - * \code - * class FunctorType { // For array of POD value - * public: - * typedef ... execution_space ; - * typedef <podType> value_type[] ; - * void operator()( <intType> , <podType> update[] ) const ; - * void init( <podType> update[] ) const ; - * void join( volatile <podType> update[] , - * volatile const <podType> input[] ) const ; - * - * typedef true_type has_final ; - * void final( <podType> update[] ) const ; - * }; - * \endcode - */ -template< class ExecPolicy , class FunctorType > -inline -void parallel_reduce( const ExecPolicy & policy - , const FunctorType & functor - , const std::string& str = "" - , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0 - ) -{ - // typedef typename - // Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space - // execution_space ; - - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ; - - typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) - , typename ValueTraits::value_type - , typename ValueTraits::pointer_type - >::type value_type ; - - Kokkos::View< value_type - , HostSpace - , Kokkos::MemoryUnmanaged - > - result_view ; - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelReduce("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , policy , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelReduce(kpID); - } -#endif -} - -// integral range policy -template< class FunctorType > -inline -void parallel_reduce( const size_t work_count - , const FunctorType & functor - , const std::string& str = "" - ) -{ - typedef typename - Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space - execution_space ; - - typedef RangePolicy< execution_space > policy ; - - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; - - typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) - , typename ValueTraits::value_type - , typename ValueTraits::pointer_type - >::type value_type ; - - Kokkos::View< value_type - , HostSpace - , Kokkos::MemoryUnmanaged - > - result_view ; - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelReduce("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelReduce(kpID); - } -#endif - -} - -// general policy and view ouput -template< class ExecPolicy , class FunctorType , class ViewType > -inline -void parallel_reduce( const ExecPolicy & policy - , const FunctorType & functor - , const ViewType & result_view - , const std::string& str = "" - , typename Impl::enable_if< - ( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value -#ifdef KOKKOS_HAVE_CUDA - && ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value -#endif - )>::type * = 0 ) -{ - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelReduce(kpID); - } -#endif - -} - -// general policy and pod or array of pod output -template< class ExecPolicy , class FunctorType > -void parallel_reduce( const ExecPolicy & policy - , const FunctorType & functor -#ifdef KOKKOS_HAVE_CUDA - , typename Impl::enable_if< - ( ! Impl::is_integral< ExecPolicy >::value && - ! 
Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value ) - , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref - , const std::string& str = "" - , typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0 - ) -#else - , typename Impl::enable_if< - ( ! Impl::is_integral< ExecPolicy >::value) - , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type - >::type result_ref - , const std::string& str = "" - ) -#endif -{ - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ; - - // Wrap the result output request in a view to inform the implementation - // of the type and memory space. - - typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) - , typename ValueTraits::value_type - , typename ValueTraits::pointer_type - >::type value_type ; - - Kokkos::View< value_type - , HostSpace - , Kokkos::MemoryUnmanaged - > - result_view( ValueOps::pointer( result_ref ) - , ValueTraits::value_count( functor ) - ); - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelReduce("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelReduce(kpID); - } -#endif - } -// integral range policy and view ouput -template< class FunctorType , class ViewType > -inline -void parallel_reduce( const size_t work_count - , const FunctorType & functor - , const ViewType & result_view - , const std::string& str = "" - , typename Impl::enable_if<( Kokkos::is_view<ViewType>::value -#ifdef KOKKOS_HAVE_CUDA - && ! Impl::is_same< - typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space, - Kokkos::Cuda>::value -#endif - )>::type * = 0 ) -{ - typedef typename - Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space - execution_space ; - - typedef RangePolicy< execution_space > ExecPolicy ; - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelReduce("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelReduce(kpID); - } -#endif - -} - -// integral range policy and pod or array of pod output -template< class FunctorType > -inline -void parallel_reduce( const size_t work_count - , const FunctorType & functor - , typename Kokkos::Impl::FunctorValueTraits< - typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value || - Impl::is_integral<FunctorType>::value, - void,FunctorType>::type - , void >::reference_type result - , const std::string& str = "" - , typename Impl::enable_if< true -#ifdef KOKKOS_HAVE_CUDA - && ! Impl::is_same< - typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space, - Kokkos::Cuda>::value -#endif - >::type * = 0 ) -{ - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; - typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ; - - typedef typename - Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space - execution_space ; - - typedef Kokkos::RangePolicy< execution_space > policy ; - - // Wrap the result output request in a view to inform the implementation - // of the type and memory space. 
- - typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) - , typename ValueTraits::value_type - , typename ValueTraits::pointer_type - >::type value_type ; - - Kokkos::View< value_type - , HostSpace - , Kokkos::MemoryUnmanaged - > - result_view( ValueOps::pointer( result ) - , ValueTraits::value_count( functor ) - ); - -#if (KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID); - } -#endif - - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); - Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); - - closure.execute(); - -#if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelReduce(kpID); - } -#endif - -} -#ifndef KOKKOS_HAVE_CUDA -template< class ExecPolicy , class FunctorType , class ResultType > -inline -void parallel_reduce( const std::string & str - , const ExecPolicy & policy - , const FunctorType & functor - , ResultType * result) -{ - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl; - #endif - - parallel_reduce(policy,functor,result,str); - - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl; - #endif - (void) str; -} - -template< class ExecPolicy , class FunctorType , class ResultType > -inline -void parallel_reduce( const std::string & str - , const ExecPolicy & policy - , const FunctorType & functor - , ResultType & result) -{ - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl; - #endif - - 
parallel_reduce(policy,functor,result,str); - - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl; - #endif - (void) str; -} - -template< class ExecPolicy , class FunctorType > -inline -void parallel_reduce( const std::string & str - , const ExecPolicy & policy - , const FunctorType & functor) -{ - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl; - #endif - - parallel_reduce(policy,functor,str); - - #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES - Kokkos::fence(); - std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl; - #endif - (void) str; -} -#endif - -} // namespace Kokkos - +#include <Kokkos_Parallel_Reduce.hpp> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -816,8 +419,8 @@ void parallel_scan( const ExecutionPolicy & policy { #if (KOKKOS_ENABLE_PROFILING) uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); } #endif @@ -828,8 +431,8 @@ void parallel_scan( const ExecutionPolicy & policy closure.execute(); #if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelScan(kpID); + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelScan(kpID); } #endif @@ -849,8 +452,8 @@ void parallel_scan( const size_t work_count #if (KOKKOS_ENABLE_PROFILING) uint64_t kpID = 0; - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::beginParallelScan("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); } #endif @@ -861,8 +464,8 @@ void parallel_scan( const size_t work_count closure.execute(); #if (KOKKOS_ENABLE_PROFILING) - if(Kokkos::Experimental::profileLibraryLoaded()) { - Kokkos::Experimental::endParallelScan(kpID); + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelScan(kpID); } #endif diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp new file mode 100644 index 0000000000..695bc79a1a --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -0,0 +1,1240 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +namespace Kokkos { + + +template<class T, class Enable = void> +struct is_reducer_type { + enum { value = 0 }; +}; + + +template<class T> +struct is_reducer_type<T,typename std::enable_if< + std::is_same<T,typename T::reducer_type>::value + >::type> { + enum { value = 1 }; +}; + +namespace Experimental { + + +template<class Scalar,class Space = HostSpace> +struct Sum { +public: + //Required + typedef Sum reducer_type; + typedef Scalar value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + + value_type init_value; + +private: + result_view_type result; + + template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value > + struct InitWrapper; + + template<class ValueType > + struct InitWrapper<ValueType,true> { + static ValueType value() { + return static_cast<value_type>(0); + } + }; + + template<class ValueType > + struct InitWrapper<ValueType,false> { + static ValueType value() { + return value_type(); + } + }; + +public: + + Sum(value_type& 
result_): + init_value(InitWrapper<value_type>::value()),result(&result_) {} + Sum(const result_view_type& result_): + init_value(InitWrapper<value_type>::value()),result(result_) {} + Sum(value_type& result_, const value_type& init_value_): + init_value(init_value_),result(&result_) {} + Sum(const result_view_type& result_, const value_type& init_value_): + init_value(init_value_),result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest += src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest += src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = init_value; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar,class Space = HostSpace> +struct Prod { +public: + //Required + typedef Prod reducer_type; + typedef Scalar value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + + value_type init_value; + +private: + result_view_type result; + + template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value > + struct InitWrapper; + + template<class ValueType > + struct InitWrapper<ValueType,true> { + static ValueType value() { + return static_cast<value_type>(1); + } + }; + + template<class ValueType > + struct InitWrapper<ValueType,false> { + static ValueType value() { + return value_type(); + } + }; + +public: + + Prod(value_type& result_): + init_value(InitWrapper<value_type>::value()),result(&result_) {} + Prod(const result_view_type& result_): + init_value(InitWrapper<value_type>::value()),result(result_) {} + Prod(value_type& result_, const value_type& init_value_): + init_value(init_value_),result(&result_) {} + Prod(const result_view_type& result_, const value_type& init_value_): + init_value(init_value_),result(result_) {} + + //Required + 
KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest *= src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest *= src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = init_value; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar, class Space = HostSpace> +struct Min { +public: + //Required + typedef Min reducer_type; + typedef typename std::remove_cv<Scalar>::type value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + + value_type init_value; + +private: + result_view_type result; + + template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value > + struct InitWrapper; + + template<class ValueType > + struct InitWrapper<ValueType,true> { + static ValueType value() { + return std::numeric_limits<value_type>::max(); + } + }; + + template<class ValueType > + struct InitWrapper<ValueType,false> { + static ValueType value() { + return value_type(); + } + }; + +public: + + Min(value_type& result_): + init_value(InitWrapper<value_type>::value()),result(&result_) {} + Min(const result_view_type& result_): + init_value(InitWrapper<value_type>::value()),result(result_) {} + Min(value_type& result_, const value_type& init_value_): + init_value(init_value_),result(&result_) {} + Min(const result_view_type& result_, const value_type& init_value_): + init_value(init_value_),result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if ( src < dest ) + dest = src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if ( src < dest ) + dest = src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = init_value; + } + + result_view_type result_view() 
const { + return result; + } +}; + +template<class Scalar, class Space = HostSpace> +struct Max { +public: + //Required + typedef Max reducer_type; + typedef typename std::remove_cv<Scalar>::type value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + + value_type init_value; + +private: + result_view_type result; + + template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value > + struct InitWrapper; + + template<class ValueType > + struct InitWrapper<ValueType,true> { + static ValueType value() { + return std::numeric_limits<value_type>::min(); + } + }; + + template<class ValueType > + struct InitWrapper<ValueType,false> { + static ValueType value() { + return value_type(); + } + }; + +public: + + Max(value_type& result_): + init_value(InitWrapper<value_type>::value()),result(&result_) {} + Max(const result_view_type& result_): + init_value(InitWrapper<value_type>::value()),result(result_) {} + Max(value_type& result_, const value_type& init_value_): + init_value(init_value_),result(&result_) {} + Max(const result_view_type& result_, const value_type& init_value_): + init_value(init_value_),result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if ( src > dest ) + dest = src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if ( src > dest ) + dest = src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = init_value; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar, class Space = HostSpace> +struct LAnd { +public: + //Required + typedef LAnd reducer_type; + typedef Scalar value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + +private: + result_view_type result; + +public: + + LAnd(value_type& 
result_):result(&result_) {} + LAnd(const result_view_type& result_):result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest = dest && src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest = dest && src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = 1; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar, class Space = HostSpace> +struct LOr { +public: + //Required + typedef LOr reducer_type; + typedef Scalar value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + +private: + result_view_type result; + +public: + + LOr(value_type& result_):result(&result_) {} + LOr(const result_view_type& result_):result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest = dest || src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest = dest || src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = 0; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar, class Space = HostSpace> +struct LXor { +public: + //Required + typedef LXor reducer_type; + typedef Scalar value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + +private: + result_view_type result; + +public: + + LXor(value_type& result_):result(&result_) {} + LXor(const result_view_type& result_):result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest = dest? (!src) : src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest = dest? 
(!src) : src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = 0; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar, class Space = HostSpace> +struct BAnd { +public: + //Required + typedef BAnd reducer_type; + typedef typename std::remove_cv<Scalar>::type value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + + value_type init_value; + +private: + result_view_type result; + +public: + + BAnd(value_type& result_): + init_value(value_type() | (~value_type())),result(&result_) {} + BAnd(const result_view_type& result_): + init_value(value_type() | (~value_type())),result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest = dest & src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest = dest & src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = init_value; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar, class Space = HostSpace> +struct BOr { +public: + //Required + typedef BOr reducer_type; + typedef typename std::remove_cv<Scalar>::type value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + + value_type init_value; + +private: + result_view_type result; + +public: + + BOr(value_type& result_): + init_value(value_type() & (~value_type())),result(&result_) {} + BOr(const result_view_type& result_): + init_value(value_type() & (~value_type())),result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest = dest | src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest = dest | src; + } + + //Optional + 
KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = init_value; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar, class Space = HostSpace> +struct BXor { +public: + //Required + typedef BXor reducer_type; + typedef typename std::remove_cv<Scalar>::type value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + + value_type init_value; + +private: + result_view_type result; + +public: + + BXor(value_type& result_): + init_value(value_type() & (~value_type())),result(&result_) {} + BXor(const result_view_type& result_): + init_value(value_type() & (~value_type())),result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest = dest ^ src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest = dest ^ src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = init_value; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar, class Index> +struct ValLocScalar { + Scalar val; + Index loc; + + KOKKOS_INLINE_FUNCTION + void operator = (const ValLocScalar& rhs) { + val = rhs.val; + loc = rhs.loc; + } + + KOKKOS_INLINE_FUNCTION + void operator = (const volatile ValLocScalar& rhs) volatile { + val = rhs.val; + loc = rhs.loc; + } +}; + +template<class Scalar, class Index, class Space = HostSpace> +struct MinLoc { +private: + typedef typename std::remove_cv<Scalar>::type scalar_type; + typedef typename std::remove_cv<Index>::type index_type; + +public: + //Required + typedef MinLoc reducer_type; + typedef ValLocScalar<scalar_type,index_type> value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + + scalar_type init_value; + +private: + result_view_type result; + + template<class ValueType, 
bool is_arithmetic = std::is_arithmetic<ValueType>::value > + struct InitWrapper; + + template<class ValueType > + struct InitWrapper<ValueType,true> { + static ValueType value() { + return std::numeric_limits<scalar_type>::max(); + } + }; + + template<class ValueType > + struct InitWrapper<ValueType,false> { + static ValueType value() { + return scalar_type(); + } + }; + +public: + + MinLoc(value_type& result_): + init_value(InitWrapper<scalar_type>::value()),result(&result_) {} + MinLoc(const result_view_type& result_): + init_value(InitWrapper<scalar_type>::value()),result(result_) {} + MinLoc(value_type& result_, const scalar_type& init_value_): + init_value(init_value_),result(&result_) {} + MinLoc(const result_view_type& result_, const scalar_type& init_value_): + init_value(init_value_),result(result_) {} + + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if ( src.val < dest.val ) + dest = src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if ( src.val < dest.val ) + dest = src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val.val = init_value; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar, class Index, class Space = HostSpace> +struct MaxLoc { +private: + typedef typename std::remove_cv<Scalar>::type scalar_type; + typedef typename std::remove_cv<Index>::type index_type; + +public: + //Required + typedef MaxLoc reducer_type; + typedef ValLocScalar<scalar_type,index_type> value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + + scalar_type init_value; + +private: + result_view_type result; + + template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value > + struct InitWrapper; + + template<class ValueType > + struct InitWrapper<ValueType,true> { + static ValueType 
value() { + return std::numeric_limits<scalar_type>::min(); + } + }; + + template<class ValueType > + struct InitWrapper<ValueType,false> { + static ValueType value() { + return scalar_type(); + } + }; + +public: + + MaxLoc(value_type& result_): + init_value(InitWrapper<scalar_type>::value()),result(&result_) {} + MaxLoc(const result_view_type& result_): + init_value(InitWrapper<scalar_type>::value()),result(result_) {} + MaxLoc(value_type& result_, const scalar_type& init_value_): + init_value(init_value_),result(&result_) {} + MaxLoc(const result_view_type& result_, const scalar_type& init_value_): + init_value(init_value_),result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if ( src.val > dest.val ) + dest = src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if ( src.val > dest.val ) + dest = src; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val.val = init_value; + } + + result_view_type result_view() const { + return result; + } +}; + +template<class Scalar, class Index> +struct MinMaxLocScalar { + Scalar min_val,max_val; + Index min_loc,max_loc; + + KOKKOS_INLINE_FUNCTION + void operator = (const MinMaxLocScalar& rhs) { + min_val = rhs.min_val; + min_loc = rhs.min_loc; + max_val = rhs.max_val; + max_loc = rhs.max_loc; + } + + KOKKOS_INLINE_FUNCTION + void operator = (const volatile MinMaxLocScalar& rhs) volatile { + min_val = rhs.min_val; + min_loc = rhs.min_loc; + max_val = rhs.max_val; + max_loc = rhs.max_loc; + } +}; + +template<class Scalar, class Index, class Space = HostSpace> +struct MinMaxLoc { +private: + typedef typename std::remove_cv<Scalar>::type scalar_type; + typedef typename std::remove_cv<Index>::type index_type; + +public: + //Required + typedef MinMaxLoc reducer_type; + typedef MinMaxLocScalar<scalar_type,index_type> value_type; + + typedef Kokkos::View<value_type, 
Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + + scalar_type min_init_value; + scalar_type max_init_value; + +private: + result_view_type result; + + template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value > + struct MinInitWrapper; + + template<class ValueType > + struct MinInitWrapper<ValueType,true> { + static ValueType value() { + return std::numeric_limits<scalar_type>::max(); + } + }; + + template<class ValueType > + struct MinInitWrapper<ValueType,false> { + static ValueType value() { + return scalar_type(); + } + }; + + template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value > + struct MaxInitWrapper; + + template<class ValueType > + struct MaxInitWrapper<ValueType,true> { + static ValueType value() { + return std::numeric_limits<scalar_type>::min(); + } + }; + + template<class ValueType > + struct MaxInitWrapper<ValueType,false> { + static ValueType value() { + return scalar_type(); + } + }; + +public: + + MinMaxLoc(value_type& result_): + min_init_value(MinInitWrapper<scalar_type>::value()),max_init_value(MaxInitWrapper<scalar_type>::value()),result(&result_) {} + MinMaxLoc(const result_view_type& result_): + min_init_value(MinInitWrapper<scalar_type>::value()),max_init_value(MaxInitWrapper<scalar_type>::value()),result(result_) {} + MinMaxLoc(value_type& result_, const scalar_type& min_init_value_, const scalar_type& max_init_value_): + min_init_value(min_init_value_),max_init_value(max_init_value_),result(&result_) {} + MinMaxLoc(const result_view_type& result_, const scalar_type& min_init_value_, const scalar_type& max_init_value_): + min_init_value(min_init_value_),max_init_value(max_init_value_),result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if ( src.min_val < dest.min_val ) { + dest.min_val = src.min_val; + dest.min_loc = src.min_loc; + } + if ( src.max_val > dest.max_val ) { + dest.max_val = 
src.max_val; + dest.max_loc = src.max_loc; + } + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if ( src.min_val < dest.min_val ) { + dest.min_val = src.min_val; + dest.min_loc = src.min_loc; + } + if ( src.max_val > dest.max_val ) { + dest.max_val = src.max_val; + dest.max_loc = src.max_loc; + } + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val.min_val = min_init_value; + val.max_val = max_init_value; + } + + result_view_type result_view() const { + return result; + } +}; +} +} + + +namespace Kokkos { +namespace Impl { + +template< class T, class ReturnType , class ValueTraits> +struct ParallelReduceReturnValue; + +template< class ReturnType , class FunctorType > +struct ParallelReduceReturnValue<typename std::enable_if<Kokkos::is_view<ReturnType>::value>::type, ReturnType, FunctorType> { + typedef ReturnType return_type; + typedef InvalidType reducer_type; + + typedef typename return_type::value_type value_type_scalar; + typedef typename return_type::value_type value_type_array[]; + + typedef typename if_c<return_type::rank==0,value_type_scalar,value_type_array>::type value_type; + + static return_type& return_value(ReturnType& return_val, const FunctorType&) { + return return_val; + } +}; + +template< class ReturnType , class FunctorType> +struct ParallelReduceReturnValue<typename std::enable_if< + !Kokkos::is_view<ReturnType>::value && + (!std::is_array<ReturnType>::value && !std::is_pointer<ReturnType>::value) && + !Kokkos::is_reducer_type<ReturnType>::value + >::type, ReturnType, FunctorType> { + typedef Kokkos::View< ReturnType + , Kokkos::HostSpace + , Kokkos::MemoryUnmanaged + > return_type; + + typedef InvalidType reducer_type; + + typedef typename return_type::value_type value_type; + + static return_type return_value(ReturnType& return_val, const FunctorType&) { + return return_type(&return_val); + } +}; + +template< class ReturnType , class FunctorType> 
+struct ParallelReduceReturnValue<typename std::enable_if< + (is_array<ReturnType>::value || std::is_pointer<ReturnType>::value) + >::type, ReturnType, FunctorType> { + typedef Kokkos::View< typename std::remove_const<ReturnType>::type + , Kokkos::HostSpace + , Kokkos::MemoryUnmanaged + > return_type; + + typedef InvalidType reducer_type; + + typedef typename return_type::value_type value_type[]; + + static return_type return_value(ReturnType& return_val, + const FunctorType& functor) { + return return_type(return_val,functor.value_count); + } +}; + +template< class ReturnType , class FunctorType> +struct ParallelReduceReturnValue<typename std::enable_if< + Kokkos::is_reducer_type<ReturnType>::value + >::type, ReturnType, FunctorType> { + typedef ReturnType return_type; + typedef ReturnType reducer_type; + typedef typename return_type::value_type value_type; + + static return_type return_value(ReturnType& return_val, + const FunctorType& functor) { + return return_val; + } +}; +} + +namespace Impl { +template< class T, class ReturnType , class FunctorType> +struct ParallelReducePolicyType; + +template< class PolicyType , class FunctorType > +struct ParallelReducePolicyType<typename std::enable_if<Kokkos::Impl::is_execution_policy<PolicyType>::value>::type, PolicyType,FunctorType> { + + typedef PolicyType policy_type; + static PolicyType policy(const PolicyType& policy_) { + return policy_; + } +}; + +template< class PolicyType , class FunctorType > +struct ParallelReducePolicyType<typename std::enable_if<std::is_integral<PolicyType>::value>::type, PolicyType,FunctorType> { + typedef typename + Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space + execution_space ; + + typedef Kokkos::RangePolicy<execution_space> policy_type; + + static policy_type policy(const PolicyType& policy_) { + return policy_type(0,policy_); + } +}; + +} + +namespace Impl { + template< class FunctorType, class ExecPolicy, class ValueType, class ExecutionSpace> + struct 
ParallelReduceFunctorType { + typedef FunctorType functor_type; + static const functor_type& functor(const functor_type& functor) { + return functor; + } + }; +} + +namespace Impl { + + template< class PolicyType, class FunctorType, class ReturnType > + struct ParallelReduceAdaptor { + typedef Impl::ParallelReduceReturnValue<void,ReturnType,FunctorType> return_value_adapter; + #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER + typedef Impl::ParallelReduceFunctorType<FunctorType,PolicyType, + typename return_value_adapter::value_type, + typename PolicyType::execution_space> functor_adaptor; + #endif + static inline + void execute(const std::string& label, + const PolicyType& policy, + const FunctorType& functor, + ReturnType& return_value) { + #if (KOKKOS_ENABLE_PROFILING) + uint64_t kpID = 0; + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID); + } + #endif + + Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); + #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER + Impl::ParallelReduce<typename functor_adaptor::functor_type, PolicyType, typename return_value_adapter::reducer_type > + closure(functor_adaptor::functor(functor), + policy, + return_value_adapter::return_value(return_value,functor)); + #else + Impl::ParallelReduce<FunctorType, PolicyType, typename return_value_adapter::reducer_type > + closure(functor, + policy, + return_value_adapter::return_value(return_value,functor)); + #endif + Kokkos::Impl::shared_allocation_tracking_release_and_enable(); + closure.execute(); + + #if (KOKKOS_ENABLE_PROFILING) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelReduce(kpID); + } + #endif + } + + }; +} +/*! \fn void parallel_reduce(label,policy,functor,return_argument) + \brief Perform a parallel reduction. + \param label An optional Label giving the call name. Must be able to construct a std::string from the argument. 
+ \param policy A Kokkos Execution Policy, such as an integer, a RangePolicy or a TeamPolicy. + \param functor A functor with a reduction operator, and optional init, join and final functions. + \param return_argument A return argument which can be a scalar, a View, or a ReducerStruct. This argument can be left out if the functor has a final function. +*/ + +/** \brief Parallel reduction + * + * parallel_reduce performs parallel reductions with arbitrary functions - i.e. + * it is not solely data based. The call expects up to 4 arguments: + * + * + * Example of a parallel_reduce functor for a POD (plain old data) value type: + * \code + * class FunctorType { // For POD value type + * public: + * typedef ... execution_space ; + * typedef <podType> value_type ; + * void operator()( <intType> iwork , <podType> & update ) const ; + * void init( <podType> & update ) const ; + * void join( volatile <podType> & update , + * volatile const <podType> & input ) const ; + * + * typedef true_type has_final ; + * void final( <podType> & update ) const ; + * }; + * \endcode + * + * Example of a parallel_reduce functor for an array of POD (plain old data) values: + * \code + * class FunctorType { // For array of POD value + * public: + * typedef ... 
execution_space ; + * typedef <podType> value_type[] ; + * void operator()( <intType> , <podType> update[] ) const ; + * void init( <podType> update[] ) const ; + * void join( volatile <podType> update[] , + * volatile const <podType> input[] ) const ; + * + * typedef true_type has_final ; + * void final( <podType> update[] ) const ; + * }; + * \endcode + */ + +// ReturnValue is scalar or array: take by reference + +template< class PolicyType, class FunctorType, class ReturnType > +inline +void parallel_reduce(const std::string& label, + const PolicyType& policy, + const FunctorType& functor, + ReturnType& return_value, + typename Impl::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value + >::type * = 0) { + Impl::ParallelReduceAdaptor<PolicyType,FunctorType,ReturnType>::execute(label,policy,functor,return_value); +} + +template< class PolicyType, class FunctorType, class ReturnType > +inline +void parallel_reduce(const PolicyType& policy, + const FunctorType& functor, + ReturnType& return_value, + typename Impl::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value + >::type * = 0) { + Impl::ParallelReduceAdaptor<PolicyType,FunctorType,ReturnType>::execute("",policy,functor,return_value); +} + +template< class FunctorType, class ReturnType > +inline +void parallel_reduce(const size_t& policy, + const FunctorType& functor, + ReturnType& return_value) { + typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type; + Impl::ParallelReduceAdaptor<policy_type,FunctorType,ReturnType>::execute("",policy_type(0,policy),functor,return_value); +} + +template< class FunctorType, class ReturnType > +inline +void parallel_reduce(const std::string& label, + const size_t& policy, + const FunctorType& functor, + ReturnType& return_value) { + typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type; + 
Impl::ParallelReduceAdaptor<policy_type,FunctorType,ReturnType>::execute(label,policy_type(0,policy),functor,return_value); +} + +// ReturnValue as View or Reducer: take by copy to allow for inline construction + +template< class PolicyType, class FunctorType, class ReturnType > +inline +void parallel_reduce(const std::string& label, + const PolicyType& policy, + const FunctorType& functor, + const ReturnType& return_value, + typename Impl::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value + >::type * = 0) { + Impl::ParallelReduceAdaptor<PolicyType,FunctorType,const ReturnType>::execute(label,policy,functor,return_value); +} + +template< class PolicyType, class FunctorType, class ReturnType > +inline +void parallel_reduce(const PolicyType& policy, + const FunctorType& functor, + const ReturnType& return_value, + typename Impl::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value + >::type * = 0) { + Impl::ParallelReduceAdaptor<PolicyType,FunctorType,const ReturnType>::execute("",policy,functor,return_value); +} + +template< class FunctorType, class ReturnType > +inline +void parallel_reduce(const size_t& policy, + const FunctorType& functor, + const ReturnType& return_value) { + typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type; + + Impl::ParallelReduceAdaptor<policy_type,FunctorType,const ReturnType>::execute("",policy_type(0,policy),functor,return_value); +} + +template< class FunctorType, class ReturnType > +inline +void parallel_reduce(const std::string& label, + const size_t& policy, + const FunctorType& functor, + const ReturnType& return_value) { + typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type; + Impl::ParallelReduceAdaptor<policy_type,FunctorType,const ReturnType>::execute(label,policy_type(0,policy),functor,return_value); +} + +// No Return Argument + +template< class PolicyType, class FunctorType> +inline +void 
parallel_reduce(const std::string& label, + const PolicyType& policy, + const FunctorType& functor, + typename Impl::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value + >::type * = 0) { + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + + typedef Kokkos::View< value_type + , Kokkos::HostSpace + , Kokkos::MemoryUnmanaged + > result_view_type; + result_view_type result_view ; + + Impl::ParallelReduceAdaptor<PolicyType,FunctorType,result_view_type>::execute(label,policy,functor,result_view); +} + +template< class PolicyType, class FunctorType > +inline +void parallel_reduce(const PolicyType& policy, + const FunctorType& functor, + typename Impl::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value + >::type * = 0) { + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + + typedef Kokkos::View< value_type + , Kokkos::HostSpace + , Kokkos::MemoryUnmanaged + > result_view_type; + result_view_type result_view ; + + Impl::ParallelReduceAdaptor<PolicyType,FunctorType,result_view_type>::execute("",policy,functor,result_view); +} + +template< class FunctorType > +inline +void parallel_reduce(const size_t& policy, + const FunctorType& functor) { + typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + + typedef Kokkos::View< value_type + , Kokkos::HostSpace + 
, Kokkos::MemoryUnmanaged + > result_view_type; + result_view_type result_view ; + + Impl::ParallelReduceAdaptor<policy_type,FunctorType,result_view_type>::execute("",policy_type(0,policy),functor,result_view); +} + +template< class FunctorType> +inline +void parallel_reduce(const std::string& label, + const size_t& policy, + const FunctorType& functor) { + typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + + typedef Kokkos::View< value_type + , Kokkos::HostSpace + , Kokkos::MemoryUnmanaged + > result_view_type; + result_view_type result_view ; + + Impl::ParallelReduceAdaptor<policy_type,FunctorType,result_view_type>::execute(label,policy_type(0,policy),functor,result_view); +} + + + +} //namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp index 17654170ed..09a5993863 100644 --- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp @@ -66,11 +66,15 @@ public: private: - mutable char * m_iter ; - char * m_end ; + mutable char * m_iter_L0 ; + char * m_end_L0 ; + mutable char * m_iter_L1 ; + char * m_end_L1 ; + mutable int m_multiplier; mutable int m_offset; + mutable int m_default_level; ScratchMemorySpace(); ScratchMemorySpace & operator = ( const ScratchMemorySpace & ); @@ -95,34 +99,58 @@ public: template< typename IntType > KOKKOS_INLINE_FUNCTION - void* get_shmem (const IntType& size) const { - void* tmp = m_iter + m_offset * align (size); - if (m_end < (m_iter += align (size) * m_multiplier)) { - m_iter -= align (size) * m_multiplier; // put it back like it was - #ifdef KOKKOS_HAVE_DEBUG - // mfh 23 Jun 2015: printf call consumes 25 registers - // in a 
CUDA build, so only print in debug mode. The - // function still returns NULL if not enough memory. - printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate " - "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size), - long(m_end-m_iter)); - #endif // KOKKOS_HAVE_DEBUG - tmp = 0; + void* get_shmem (const IntType& size, int level = -1) const { + if(level == -1) + level = m_default_level; + if(level == 0) { + void* tmp = m_iter_L0 + m_offset * align (size); + if (m_end_L0 < (m_iter_L0 += align (size) * m_multiplier)) { + m_iter_L0 -= align (size) * m_multiplier; // put it back like it was + #ifdef KOKKOS_HAVE_DEBUG + // mfh 23 Jun 2015: printf call consumes 25 registers + // in a CUDA build, so only print in debug mode. The + // function still returns NULL if not enough memory. + printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate " + "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size), + long(m_end_L0-m_iter_L0)); + #endif // KOKKOS_HAVE_DEBUG + tmp = 0; + } + return tmp; + } else { + void* tmp = m_iter_L1 + m_offset * align (size); + if (m_end_L1 < (m_iter_L1 += align (size) * m_multiplier)) { + m_iter_L1 -= align (size) * m_multiplier; // put it back like it was + #ifdef KOKKOS_HAVE_DEBUG + // mfh 23 Jun 2015: printf call consumes 25 registers + // in a CUDA build, so only print in debug mode. The + // function still returns NULL if not enough memory. 
+ printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate " + "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size), + long(m_end_L1-m_iter_L1)); + #endif // KOKKOS_HAVE_DEBUG + tmp = 0; + } + return tmp; + } - return tmp; } template< typename IntType > KOKKOS_INLINE_FUNCTION - ScratchMemorySpace( void * ptr , const IntType & size ) - : m_iter( (char *) ptr ) - , m_end( m_iter + size ) + ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0) + : m_iter_L0( (char *) ptr_L0 ) + , m_end_L0( m_iter_L0 + size_L0 ) + , m_iter_L1( (char *) ptr_L1 ) + , m_end_L1( m_iter_L1 + size_L1 ) , m_multiplier( 1 ) , m_offset( 0 ) + , m_default_level( 0 ) {} KOKKOS_INLINE_FUNCTION - const ScratchMemorySpace& set_team_thread_mode(const int& multiplier, const int& offset) const { + const ScratchMemorySpace& set_team_thread_mode(const int& level, const int& multiplier, const int& offset) const { + m_default_level = level; m_multiplier = multiplier; m_offset = offset; return *this; diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp index 656be5d09f..233b56c939 100644 --- a/lib/kokkos/core/src/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Kokkos_Serial.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -50,12 +50,17 @@ #include <cstddef> #include <iosfwd> #include <Kokkos_Parallel.hpp> +#include <Kokkos_TaskPolicy.hpp> #include <Kokkos_Layout.hpp> #include <Kokkos_HostSpace.hpp> #include <Kokkos_ScratchSpace.hpp> #include <Kokkos_MemoryTraits.hpp> #include <impl/Kokkos_Tags.hpp> #include <impl/Kokkos_FunctorAdapter.hpp> +#include <impl/Kokkos_Profiling_Interface.hpp> + + +#include <KokkosExp_MDRangePolicy.hpp> #if defined( KOKKOS_HAVE_SERIAL ) @@ -142,7 +147,9 @@ public: // Init the array of locks used for arbitrarily sized atomics Impl::init_lock_array_host_space(); - + #if (KOKKOS_ENABLE_PROFILING) + Kokkos::Profiling::initialize(); + #endif } static int is_initialized() { return 1 ; } @@ -151,7 +158,11 @@ public: static int concurrency() {return 1;}; //! Free any resources being consumed by the device. - static void finalize() {} + static void finalize() { + #if (KOKKOS_ENABLE_PROFILING) + Kokkos::Profiling::finalize(); + #endif + } //! Print configuration information to the given output stream. static void print_configuration( std::ostream & , const bool /* detail */ = false ) {} @@ -307,8 +318,8 @@ class TeamPolicyInternal< Kokkos::Serial , Properties ... 
>:public PolicyTraits< { private: - size_t m_team_scratch_size ; - size_t m_thread_scratch_size ; + size_t m_team_scratch_size[2] ; + size_t m_thread_scratch_size[2] ; int m_league_size ; int m_chunk_size; @@ -324,8 +335,10 @@ public: TeamPolicyInternal& operator = (const TeamPolicyInternal& p) { m_league_size = p.m_league_size; - m_team_scratch_size = p.m_team_scratch_size; - m_thread_scratch_size = p.m_thread_scratch_size; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; m_chunk_size = p.m_chunk_size; return *this; } @@ -348,15 +361,15 @@ public: inline int team_size() const { return 1 ; } inline int league_size() const { return m_league_size ; } - inline size_t scratch_size() const { return m_team_scratch_size + m_thread_scratch_size; } + inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; } /** \brief Specify league size, request team size */ TeamPolicyInternal( execution_space & , int league_size_request , int /* team_size_request */ , int /* vector_length_request */ = 1 ) - : m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_league_size( league_size_request ) , m_chunk_size ( 32 ) {} @@ -365,8 +378,8 @@ public: , int league_size_request , const Kokkos::AUTO_t & /* team_size_request */ , int /* vector_length_request */ = 1 ) - : m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_league_size( league_size_request ) , m_chunk_size ( 32 ) {} @@ -374,8 +387,8 @@ public: TeamPolicyInternal( int league_size_request , int /* team_size_request */ , int /* vector_length_request */ = 1 ) - : m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + : 
m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_league_size( league_size_request ) , m_chunk_size ( 32 ) {} @@ -383,8 +396,8 @@ public: TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & /* team_size_request */ , int /* vector_length_request */ = 1 ) - : m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_league_size( league_size_request ) , m_chunk_size ( 32 ) {} @@ -401,26 +414,23 @@ public: /** \brief set per team scratch size for a specific level of the scratch hierarchy */ inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const { - (void) level; TeamPolicyInternal p = *this; - p.m_team_scratch_size = per_team.value; + p.m_team_scratch_size[level] = per_team.value; return p; }; /** \brief set per thread scratch size for a specific level of the scratch hierarchy */ inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const { - (void) level; TeamPolicyInternal p = *this; - p.m_thread_scratch_size = per_thread.value; + p.m_thread_scratch_size[level] = per_thread.value; return p; }; /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */ inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const { - (void) level; TeamPolicyInternal p = *this; - p.m_team_scratch_size = per_team.value; - p.m_thread_scratch_size = per_thread.value; + p.m_team_scratch_size[level] = per_team.value; + p.m_thread_scratch_size[level] = per_thread.value; return p; }; @@ -440,7 +450,7 @@ namespace Kokkos { namespace Impl { template< class FunctorType , class ... Traits > -class ParallelFor< FunctorType , +class ParallelFor< FunctorType , Kokkos::RangePolicy< Traits ... 
> , Kokkos::Serial > @@ -489,9 +499,10 @@ public: /*--------------------------------------------------------------------------*/ -template< class FunctorType , class ... Traits > +template< class FunctorType , class ReducerType , class ... Traits > class ParallelReduce< FunctorType , Kokkos::RangePolicy< Traits ... > + , ReducerType , Kokkos::Serial > { @@ -499,14 +510,19 @@ private: typedef Kokkos::RangePolicy< Traits ... > Policy ; typedef typename Policy::work_tag WorkTag ; - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; + + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; const FunctorType m_functor ; const Policy m_policy ; + const ReducerType m_reducer ; const pointer_type m_result_ptr ; @@ -515,15 +531,15 @@ private: typename std::enable_if< std::is_same< TagType , void >::value >::type exec( pointer_type ptr ) const { - reference_type update = ValueInit::init( m_functor , ptr ); + reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr ); const typename Policy::member_type e = m_policy.end(); for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) { m_functor( i , update ); } - Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: - final( m_functor , ptr ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >:: + final( ReducerConditional::select(m_functor , m_reducer) , ptr ); } template< class TagType > @@ -532,15 +548,15 @@ private: exec( pointer_type ptr ) const { const 
TagType t{} ; - reference_type update = ValueInit::init( m_functor , ptr ); + reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr ); const typename Policy::member_type e = m_policy.end(); for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) { m_functor( t , i , update ); } - Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: - final( m_functor , ptr ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >:: + final( ReducerConditional::select(m_functor , m_reducer) , ptr ); } public: @@ -549,25 +565,43 @@ public: void execute() const { pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize - ( ValueTraits::value_size( m_functor ) , 0 ); + ( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 ); this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr ); } - template< class ViewType > + template< class HostViewType > + ParallelReduce( const FunctorType & arg_functor , + const Policy & arg_policy , + const HostViewType & arg_result_view , + typename std::enable_if< + Kokkos::is_view< HostViewType >::value && + !Kokkos::is_reducer_type<ReducerType>::value + ,void*>::type = NULL) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_result_view.ptr_on_device() ) + { + static_assert( Kokkos::is_view< HostViewType >::value + , "Kokkos::Serial reduce result must be a View" ); + + static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value + , "Kokkos::Serial reduce result must be a View in HostSpace" ); + } + + inline ParallelReduce( const FunctorType & arg_functor - , const Policy & arg_policy - , const ViewType & arg_result ) + , Policy arg_policy + , const ReducerType& reducer ) : m_functor( arg_functor ) , m_policy( arg_policy ) - , m_result_ptr( arg_result.ptr_on_device() ) + , m_reducer( reducer ) + , m_result_ptr( reducer.result_view().data() ) { - 
static_assert( Kokkos::is_view< ViewType >::value - , "Reduction result on Kokkos::Serial must be a Kokkos::View" ); - - static_assert( std::is_same< typename ViewType::memory_space + /*static_assert( std::is_same< typename ViewType::memory_space , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" ); + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ } }; @@ -697,15 +731,16 @@ public: , const Policy & arg_policy ) : m_functor( arg_functor ) , m_league( arg_policy.league_size() ) - , m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) ) + , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) ) { } }; /*--------------------------------------------------------------------------*/ -template< class FunctorType , class ... Properties > +template< class FunctorType , class ReducerType , class ... Properties > class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Properties ... > + , ReducerType , Kokkos::Serial > { @@ -714,30 +749,35 @@ private: typedef TeamPolicyInternal< Kokkos::Serial, Properties ... 
> Policy ; typedef typename Policy::member_type Member ; typedef typename Policy::work_tag WorkTag ; - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; + + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; const FunctorType m_functor ; const int m_league ; - const int m_shared ; + const ReducerType m_reducer ; pointer_type m_result_ptr ; + const int m_shared ; template< class TagType > inline typename std::enable_if< std::is_same< TagType , void >::value >::type exec( pointer_type ptr ) const { - reference_type update = ValueInit::init( m_functor , ptr ); + reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr ); for ( int ileague = 0 ; ileague < m_league ; ++ileague ) { m_functor( Member(ileague,m_league,m_shared) , update ); } - Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: - final( m_functor , ptr ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >:: + final( ReducerConditional::select(m_functor , m_reducer) , ptr ); } template< class TagType > @@ -747,14 +787,14 @@ private: { const TagType t{} ; - reference_type update = ValueInit::init( m_functor , ptr ); + reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr ); for ( int ileague = 0 ; ileague < m_league ; ++ileague ) { m_functor( t , Member(ileague,m_league,m_shared) , update ); } - Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: - final( m_functor , ptr ); + 
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >:: + final( ReducerConditional::select(m_functor , m_reducer) , ptr ); } public: @@ -763,7 +803,7 @@ public: void execute() const { pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize - ( ValueTraits::value_size( m_functor ) , m_shared ); + ( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , m_shared ); this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr ); } @@ -771,12 +811,16 @@ public: template< class ViewType > ParallelReduce( const FunctorType & arg_functor , const Policy & arg_policy - , const ViewType & arg_result - ) + , const ViewType & arg_result , + typename std::enable_if< + Kokkos::is_view< ViewType >::value && + !Kokkos::is_reducer_type<ReducerType>::value + ,void*>::type = NULL) : m_functor( arg_functor ) , m_league( arg_policy.league_size() ) - , m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) ) + , m_reducer( InvalidType() ) , m_result_ptr( arg_result.ptr_on_device() ) + , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) ) { static_assert( Kokkos::is_view< ViewType >::value , "Reduction result on Kokkos::Serial must be a Kokkos::View" ); @@ -786,6 +830,21 @@ public: , "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" ); } + inline + ParallelReduce( const FunctorType & arg_functor + , Policy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_league( arg_policy.league_size() ) + , m_reducer( reducer ) + , m_result_ptr( reducer.result_view().data() ) + , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a 
Kokkos::View in HostSpace" );*/ + } + }; } // namespace Impl @@ -1045,6 +1104,10 @@ void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const Func } } +//---------------------------------------------------------------------------- + +#include <impl/Kokkos_Serial_Task.hpp> + #endif // defined( KOKKOS_HAVE_SERIAL ) #endif /* #define KOKKOS_SERIAL_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp index 5f999e9a34..fc9113b750 100644 --- a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp @@ -1,4 +1,3 @@ - /* //@HEADER // ************************************************************************ @@ -47,13 +46,655 @@ #ifndef KOKKOS_TASKPOLICY_HPP #define KOKKOS_TASKPOLICY_HPP +//---------------------------------------------------------------------------- + #include <Kokkos_Core_fwd.hpp> + +// If compiling with CUDA then must be using CUDA 8 or better +// and use relocateable device code to enable the task policy. 
+// nvcc relocatable device code option: --relocatable-device-code=true + +#if ( defined( KOKKOS_COMPILER_NVCC ) ) + #if ( 8000 <= CUDA_VERSION ) && \ + defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) + + #define KOKKOS_ENABLE_TASKPOLICY + + #endif +#else + +#define KOKKOS_ENABLE_TASKPOLICY + +#endif + + +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + +//---------------------------------------------------------------------------- + #include <Kokkos_MemoryPool.hpp> -#include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_Tags.hpp> -#include <impl/Kokkos_StaticAssert.hpp> -#include <impl/Kokkos_AllocationTracker.hpp> +#include <impl/Kokkos_TaskQueue.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +enum TaskType { TaskTeam = Impl::TaskBase<void,void,void>::TaskTeam + , TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle }; + +enum TaskPriority { TaskHighPriority = 0 + , TaskRegularPriority = 1 + , TaskLowPriority = 2 }; + +template< typename Space > +class TaskPolicy ; + +template< typename Space > +void wait( TaskPolicy< Space > const & ); + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/*\brief Implementation data for task data management, access, and execution. + * + * CRTP Inheritance structure to allow static_cast from the + * task root type and a task's FunctorType. + * + * TaskBase< Space , ResultType , FunctorType > + * : TaskBase< Space , ResultType , void > + * , FunctorType + * { ... }; + * + * TaskBase< Space , ResultType , void > + * : TaskBase< Space , void , void > + * { ... 
}; + */ +template< typename Space , typename ResultType , typename FunctorType > +class TaskBase ; + +template< typename Space > +class TaskExec ; + +}} // namespace Kokkos::Impl + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** + * + * Future< space > // value_type == void + * Future< value > // space == Default + * Future< value , space > + * + */ +template< typename Arg1 /* = void */ , typename Arg2 /* = void */ > +class Future { +private: + + template< typename > friend class TaskPolicy ; + template< typename , typename > friend class Future ; + template< typename , typename , typename > friend class Impl::TaskBase ; + + enum { Arg1_is_space = Kokkos::Impl::is_space< Arg1 >::value }; + enum { Arg2_is_space = Kokkos::Impl::is_space< Arg2 >::value }; + enum { Arg1_is_value = ! Arg1_is_space && + ! std::is_same< Arg1 , void >::value }; + enum { Arg2_is_value = ! Arg2_is_space && + ! std::is_same< Arg2 , void >::value }; + + static_assert( ! ( Arg1_is_space && Arg2_is_space ) + , "Future cannot be given two spaces" ); + + static_assert( ! 
( Arg1_is_value && Arg2_is_value ) + , "Future cannot be given two value types" ); + + using ValueType = + typename std::conditional< Arg1_is_value , Arg1 , + typename std::conditional< Arg2_is_value , Arg2 , void + >::type >::type ; + + using Space = + typename std::conditional< Arg1_is_space , Arg1 , + typename std::conditional< Arg2_is_space , Arg2 , void + >::type >::type ; + + using task_base = Impl::TaskBase< Space , ValueType , void > ; + using queue_type = Impl::TaskQueue< Space > ; + + task_base * m_task ; + + KOKKOS_INLINE_FUNCTION explicit + Future( task_base * task ) : m_task(0) + { if ( task ) queue_type::assign( & m_task , task ); } + + //---------------------------------------- + +public: + + using execution_space = typename Space::execution_space ; + using value_type = ValueType ; + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + bool is_null() const { return 0 == m_task ; } + + KOKKOS_INLINE_FUNCTION + int reference_count() const + { return 0 != m_task ? 
m_task->reference_count() : 0 ; } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + ~Future() { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + constexpr Future() noexcept : m_task(0) {} + + KOKKOS_INLINE_FUNCTION + Future( Future && rhs ) + : m_task( rhs.m_task ) { rhs.m_task = 0 ; } + + KOKKOS_INLINE_FUNCTION + Future( const Future & rhs ) + : m_task(0) + { if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); } + + KOKKOS_INLINE_FUNCTION + Future & operator = ( Future && rhs ) + { + if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); + m_task = rhs.m_task ; + rhs.m_task = 0 ; + return *this ; + } + + KOKKOS_INLINE_FUNCTION + Future & operator = ( const Future & rhs ) + { + if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); + return *this ; + } + + //---------------------------------------- + + template< class A1 , class A2 > + KOKKOS_INLINE_FUNCTION + Future( Future<A1,A2> && rhs ) + : m_task( rhs.m_task ) + { + static_assert + ( std::is_same< Space , void >::value || + std::is_same< Space , typename Future<A1,A2>::Space >::value + , "Assigned Futures must have the same space" ); + + static_assert + ( std::is_same< value_type , void >::value || + std::is_same< value_type , typename Future<A1,A2>::value_type >::value + , "Assigned Futures must have the same value_type" ); + + rhs.m_task = 0 ; + } + + template< class A1 , class A2 > + KOKKOS_INLINE_FUNCTION + Future( const Future<A1,A2> & rhs ) + : m_task(0) + { + static_assert + ( std::is_same< Space , void >::value || + std::is_same< Space , typename Future<A1,A2>::Space >::value + , "Assigned Futures must have the same space" ); + + static_assert + ( std::is_same< value_type , void >::value || + std::is_same< value_type , typename Future<A1,A2>::value_type >::value + , "Assigned Futures must have the same value_type" ); + + if ( rhs.m_task ) 
queue_type::assign( & m_task , rhs.m_task ); + } + + template< class A1 , class A2 > + KOKKOS_INLINE_FUNCTION + Future & operator = ( const Future<A1,A2> & rhs ) + { + static_assert + ( std::is_same< Space , void >::value || + std::is_same< Space , typename Future<A1,A2>::Space >::value + , "Assigned Futures must have the same space" ); + + static_assert + ( std::is_same< value_type , void >::value || + std::is_same< value_type , typename Future<A1,A2>::value_type >::value + , "Assigned Futures must have the same value_type" ); + + if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); + return *this ; + } + + template< class A1 , class A2 > + KOKKOS_INLINE_FUNCTION + Future & operator = ( Future<A1,A2> && rhs ) + { + static_assert + ( std::is_same< Space , void >::value || + std::is_same< Space , typename Future<A1,A2>::Space >::value + , "Assigned Futures must have the same space" ); + + static_assert + ( std::is_same< value_type , void >::value || + std::is_same< value_type , typename Future<A1,A2>::value_type >::value + , "Assigned Futures must have the same value_type" ); + + if ( m_task ) queue_type::assign( & m_task , (task_base*) 0 ); + m_task = rhs.m_task ; + rhs.m_task = 0 ; + return *this ; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + typename task_base::get_return_type + get() const + { + if ( 0 == m_task ) { + Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()"); + } + return m_task->get(); + } +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< typename ExecSpace > +class TaskPolicy +{ +private: + + using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ; + using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ; + using task_base = Impl::TaskBase< ExecSpace , void , void > ; + + track_type m_track ; + 
queue_type * m_queue ; + + //---------------------------------------- + // Process optional arguments to spawn and respawn functions + + KOKKOS_INLINE_FUNCTION static + void assign( task_base * const ) {} + + // TaskTeam or TaskSingle + template< typename ... Options > + KOKKOS_INLINE_FUNCTION static + void assign( task_base * const task + , TaskType const & arg + , Options const & ... opts ) + { + task->m_task_type = arg ; + assign( task , opts ... ); + } + + // TaskHighPriority or TaskRegularPriority or TaskLowPriority + template< typename ... Options > + KOKKOS_INLINE_FUNCTION static + void assign( task_base * const task + , TaskPriority const & arg + , Options const & ... opts ) + { + task->m_priority = arg ; + assign( task , opts ... ); + } + + // Future for a dependence + template< typename A1 , typename A2 , typename ... Options > + KOKKOS_INLINE_FUNCTION static + void assign( task_base * const task + , Future< A1 , A2 > const & arg + , Options const & ... opts ) + { + // Assign dependence to task->m_next + // which will be processed within subsequent call to schedule. + // Error if the dependence is reset. + + if ( 0 != Kokkos::atomic_exchange(& task->m_next, arg.m_task) ) { + Kokkos::abort("TaskPolicy ERROR: resetting task dependence"); + } + + if ( 0 != arg.m_task ) { + // The future may be destroyed upon returning from this call + // so increment reference count to track this assignment. + Kokkos::atomic_fetch_add( &(arg.m_task->m_ref_count) , 1 ); + } + + assign( task , opts ... 
); + } + + //---------------------------------------- + +public: + + using execution_policy = TaskPolicy ; + using execution_space = ExecSpace ; + using memory_space = typename queue_type::memory_space ; + using member_type = Kokkos::Impl::TaskExec< ExecSpace > ; + + KOKKOS_INLINE_FUNCTION + TaskPolicy() : m_track(), m_queue(0) {} + + KOKKOS_INLINE_FUNCTION + TaskPolicy( TaskPolicy && rhs ) = default ; + + KOKKOS_INLINE_FUNCTION + TaskPolicy( TaskPolicy const & rhs ) = default ; + + KOKKOS_INLINE_FUNCTION + TaskPolicy & operator = ( TaskPolicy && rhs ) = default ; + + KOKKOS_INLINE_FUNCTION + TaskPolicy & operator = ( TaskPolicy const & rhs ) = default ; + + TaskPolicy( memory_space const & arg_memory_space + , unsigned const arg_memory_pool_capacity + , unsigned const arg_memory_pool_log2_superblock = 12 ) + : m_track() + , m_queue(0) + { + typedef Kokkos::Experimental::Impl::SharedAllocationRecord + < memory_space , typename queue_type::Destroy > + record_type ; + + record_type * record = + record_type::allocate( arg_memory_space + , "TaskQueue" + , sizeof(queue_type) + ); + + m_queue = new( record->data() ) + queue_type( arg_memory_space + , arg_memory_pool_capacity + , arg_memory_pool_log2_superblock ); + + record->m_destroy.m_queue = m_queue ; + + m_track.assign_allocated_record_to_uninitialized( record ); + } + + //---------------------------------------- + /**\brief Allocation size for a spawned task */ + template< typename FunctorType > + KOKKOS_FUNCTION + size_t spawn_allocation_size() const + { + using task_type = Impl::TaskBase< execution_space + , typename FunctorType::value_type + , FunctorType > ; + + return m_queue->allocate_block_size( sizeof(task_type) ); + } + + /**\brief Allocation size for a when_all aggregate */ + KOKKOS_FUNCTION + size_t when_all_allocation_size( int narg ) const + { + using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ; + + return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) ); 
+ } + + //---------------------------------------- + + /**\brief A task spawns a task with options + * + * 1) High, Normal, or Low priority + * 2) With or without dependence + * 3) Team or Serial + */ + template< typename FunctorType , typename ... Options > + KOKKOS_FUNCTION + Future< typename FunctorType::value_type , ExecSpace > + task_spawn( FunctorType const & arg_functor + , Options const & ... arg_options + ) const + { + using value_type = typename FunctorType::value_type ; + using future_type = Future< value_type , execution_space > ; + using task_type = Impl::TaskBase< execution_space + , value_type + , FunctorType > ; + + //---------------------------------------- + // Give single-thread back-ends an opportunity to clear + // queue of ready tasks before allocating a new task + + m_queue->iff_single_thread_recursive_execute(); + + //---------------------------------------- + + future_type f ; + + // Allocate task from memory pool + f.m_task = + reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type))); + if ( f.m_task ) { + + // Placement new construction + new ( f.m_task ) task_type( arg_functor ); + + // Reference count starts at two + // +1 for matching decrement when task is complete + // +1 for future + f.m_task->m_queue = m_queue ; + f.m_task->m_ref_count = 2 ; + f.m_task->m_alloc_size = sizeof(task_type); + + assign( f.m_task , arg_options... ); + + // Spawning from within the execution space so the + // apply function pointer is guaranteed to be valid + f.m_task->m_apply = task_type::apply ; + + m_queue->schedule( f.m_task ); + // this task may be updated or executed at any moment + } + + return f ; + } + + /**\brief The host process spawns a task with options + * + * 1) High, Normal, or Low priority + * 2) With or without dependence + * 3) Team or Serial + */ + template< typename FunctorType , typename ... 
Options > + inline + Future< typename FunctorType::value_type , ExecSpace > + host_spawn( FunctorType const & arg_functor + , Options const & ... arg_options + ) const + { + using value_type = typename FunctorType::value_type ; + using future_type = Future< value_type , execution_space > ; + using task_type = Impl::TaskBase< execution_space + , value_type + , FunctorType > ; + + future_type f ; + + // Allocate task from memory pool + f.m_task = + reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) ); + + if ( f.m_task ) { + + // Placement new construction + new( f.m_task ) task_type( arg_functor ); + + // Reference count starts at two: + // +1 to match decrement when task completes + // +1 for the future + f.m_task->m_queue = m_queue ; + f.m_task->m_ref_count = 2 ; + f.m_task->m_alloc_size = sizeof(task_type); + + assign( f.m_task , arg_options... ); + + // Potentially spawning outside execution space so the + // apply function pointer must be obtained from execution space. + // Required for Cuda execution space function pointer. + queue_type::specialization::template + proc_set_apply< FunctorType >( & f.m_task->m_apply ); + + m_queue->schedule( f.m_task ); + } + return f ; + } + + /**\brief Return a future that is complete + * when all input futures are complete. 
+ */ + template< typename A1 , typename A2 > + KOKKOS_FUNCTION + Future< ExecSpace > + when_all( int narg , Future< A1 , A2 > const * const arg ) const + { + static_assert + ( std::is_same< execution_space + , typename Future< A1 , A2 >::execution_space + >::value + , "Future must have same execution space" ); + + using future_type = Future< ExecSpace > ; + using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ; + + future_type f ; + + size_t const size = sizeof(task_base) + narg * sizeof(task_base*); + + f.m_task = + reinterpret_cast< task_base * >( m_queue->allocate( size ) ); + + if ( f.m_task ) { + + new( f.m_task ) task_base(); + + // Reference count starts at two: + // +1 to match decrement when task completes + // +1 for the future + f.m_task->m_queue = m_queue ; + f.m_task->m_ref_count = 2 ; + f.m_task->m_alloc_size = size ; + f.m_task->m_dep_count = narg ; + f.m_task->m_task_type = task_base::Aggregate ; + + task_base ** const dep = f.m_task->aggregate_dependences(); + + // Assign dependences to increment their reference count + // The futures may be destroyed upon returning from this call + // so increment reference count to track this assignment. + + for ( int i = 0 ; i < narg ; ++i ) { + task_base * const t = dep[i] = arg[i].m_task ; + if ( 0 != t ) { + Kokkos::atomic_fetch_add( &(t->m_ref_count) , 1 ); + } + } + + m_queue->schedule( f.m_task ); + // this when_all may be processed at any moment + } + + return f ; + } + + /**\brief An executing task respawns itself with options + * + * 1) High, Normal, or Low priority + * 2) With or without dependence + */ + template< class FunctorType , typename ... Options > + KOKKOS_FUNCTION + void respawn( FunctorType * task_self + , Options const & ... 
arg_options ) const + { + using value_type = typename FunctorType::value_type ; + using task_type = Impl::TaskBase< execution_space + , value_type + , FunctorType > ; + + task_base * const zero = (task_base *) 0 ; + task_base * const lock = (task_base *) task_base::LockTag ; + task_type * const task = static_cast< task_type * >( task_self ); + + // Precondition: + // task is in Executing state + // therefore m_next == LockTag + // + // Change to m_next == 0 for no dependence + + if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) { + Kokkos::abort("TaskPolicy::respawn ERROR: already respawned"); + } + + assign( task , arg_options... ); + + // Postcondition: + // task is in Executing-Respawn state + // therefore m_next == dependece or 0 + } + + //---------------------------------------- + + template< typename S > + friend + void Kokkos::wait( Kokkos::TaskPolicy< S > const & ); + + //---------------------------------------- + + inline + int allocation_capacity() const noexcept + { return m_queue->m_memory.get_mem_size(); } + + KOKKOS_INLINE_FUNCTION + int allocated_task_count() const noexcept + { return m_queue->m_count_alloc ; } + + KOKKOS_INLINE_FUNCTION + int allocated_task_count_max() const noexcept + { return m_queue->m_max_alloc ; } + + KOKKOS_INLINE_FUNCTION + long allocated_task_count_accum() const noexcept + { return m_queue->m_accum_alloc ; } + +}; + +template< typename ExecSpace > +inline +void wait( TaskPolicy< ExecSpace > const & policy ) +{ policy.m_queue->execute(); } + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { @@ -463,5 +1104,6 @@ void wait( TaskPolicy< ExecSpace > & ); 
//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#endif /* #define KOKKOS_TASKPOLICY_HPP */ +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ +#endif /* #ifndef KOKKOS_TASKPOLICY_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_Threads.hpp b/lib/kokkos/core/src/Kokkos_Threads.hpp index 23efefce34..c9ebbf9265 100644 --- a/lib/kokkos/core/src/Kokkos_Threads.hpp +++ b/lib/kokkos/core/src/Kokkos_Threads.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -211,6 +211,8 @@ struct VerifyExecutionCanAccessMemorySpace #include <Threads/Kokkos_ThreadsTeam.hpp> #include <Threads/Kokkos_Threads_Parallel.hpp> +#include <KokkosExp_MDRangePolicy.hpp> + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index 2f98b4dfa5..1cc8b03381 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -46,69 +46,61 @@ #include <type_traits> #include <string> -#include <Kokkos_Core_fwd.hpp> - -#if ! 
KOKKOS_USING_EXP_VIEW +#include <algorithm> +#include <initializer_list> +#include <Kokkos_Core_fwd.hpp> #include <Kokkos_HostSpace.hpp> #include <Kokkos_MemoryTraits.hpp> - -#include <impl/Kokkos_StaticAssert.hpp> -#include <impl/Kokkos_Traits.hpp> -#include <impl/Kokkos_Shape.hpp> -#include <impl/Kokkos_AnalyzeShape.hpp> -#include <impl/Kokkos_Tags.hpp> - -// Must define before includng <impl/Kokkos_ViewOffset.hpp> -namespace Kokkos { struct ALL ; } - -#include <impl/Kokkos_ViewOffset.hpp> -#include <impl/Kokkos_ViewSupport.hpp> +#include <Kokkos_ExecPolicy.hpp> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { +namespace Experimental { namespace Impl { -/** \brief View specialization mapping of view traits to a specialization tag */ -template< class ValueType , - class ArraySpecialize , - class ArrayLayout , - class MemorySpace , - class MemoryTraits > -struct ViewSpecialize ; - -/** \brief Defines the type of a subview given a source view type - * and subview argument types. - */ -template< class SrcViewType - , class Arg0Type - , class Arg1Type - , class Arg2Type - , class Arg3Type - , class Arg4Type - , class Arg5Type - , class Arg6Type - , class Arg7Type - > -struct ViewSubview /* { typedef ... type ; } */ ; +template< class DstMemorySpace , class SrcMemorySpace > +struct DeepCopy ; -template< class DstViewSpecialize , - class SrcViewSpecialize = void , - class Enable = void > -struct ViewAssignment ; +template< class DataType > +struct ViewArrayAnalysis ; -template< class DstMemorySpace , class SrcMemorySpace , class ExecutionSpace> -struct DeepCopy ; +template< class DataType , class ArrayLayout + , typename ValueType = + typename ViewArrayAnalysis< DataType >::non_const_value_type + > +struct ViewDataAnalysis ; + +template< class , class ... 
> +class ViewMapping { public: enum { is_assignable = false }; }; + +template< class MemorySpace > +struct ViewOperatorBoundsErrorAbort ; + +template<> +struct ViewOperatorBoundsErrorAbort< Kokkos::HostSpace > { + static void apply( const size_t rank + , const size_t n0 , const size_t n1 + , const size_t n2 , const size_t n3 + , const size_t n4 , const size_t n5 + , const size_t n6 , const size_t n7 + , const size_t i0 , const size_t i1 + , const size_t i2 , const size_t i3 + , const size_t i4 , const size_t i5 + , const size_t i6 , const size_t i7 ); +}; } /* namespace Impl */ -} // namespace Kokkos +} /* namespace Experimental */ +} /* namespace Kokkos */ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { +namespace Experimental { /** \class ViewTraits * \brief Traits class for accessing attributes of a View. @@ -116,247 +108,189 @@ namespace Kokkos { * This is an implementation detail of View. It is only of interest * to developers implementing a new specialization of View. 
* - * Template argument permutations: - * - View< DataType , void , void , void > - * - View< DataType , Space , void , void > - * - View< DataType , Space , MemoryTraits , void > - * - View< DataType , Space , void , MemoryTraits > - * - View< DataType , ArrayLayout , void , void > - * - View< DataType , ArrayLayout , Space , void > - * - View< DataType , ArrayLayout , MemoryTraits , void > - * - View< DataType , ArrayLayout , Space , MemoryTraits > - * - View< DataType , MemoryTraits , void , void > + * Template argument options: + * - View< DataType > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , ArrayLayout > + * - View< DataType , ArrayLayout , Space > + * - View< DataType , ArrayLayout , MemoryTraits > + * - View< DataType , ArrayLayout , Space , MemoryTraits > + * - View< DataType , MemoryTraits > */ -template< class DataType , - class Arg1 = void , - class Arg2 = void , - class Arg3 = void > -class ViewTraits { -private: +template< class DataType , class ... Properties > +struct ViewTraits ; + +template<> +struct ViewTraits< void > +{ + typedef void execution_space ; + typedef void memory_space ; + typedef void HostMirrorSpace ; + typedef void array_layout ; + typedef void memory_traits ; +}; + +template< class ... Prop > +struct ViewTraits< void , void , Prop ... > +{ + // Ignore an extraneous 'void' + typedef typename ViewTraits<void,Prop...>::execution_space execution_space ; + typedef typename ViewTraits<void,Prop...>::memory_space memory_space ; + typedef typename ViewTraits<void,Prop...>::HostMirrorSpace HostMirrorSpace ; + typedef typename ViewTraits<void,Prop...>::array_layout array_layout ; + typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ; +}; + +template< class ArrayLayout , class ... Prop > +struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_array_layout<ArrayLayout>::value >::type , ArrayLayout , Prop ... 
> +{ + // Specify layout, keep subsequent space and memory traits arguments + + typedef typename ViewTraits<void,Prop...>::execution_space execution_space ; + typedef typename ViewTraits<void,Prop...>::memory_space memory_space ; + typedef typename ViewTraits<void,Prop...>::HostMirrorSpace HostMirrorSpace ; + typedef ArrayLayout array_layout ; + typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ; +}; + +template< class Space , class ... Prop > +struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_space<Space>::value >::type , Space , Prop ... > +{ + // Specify Space, memory traits should be the only subsequent argument. + + static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value && + std::is_same< typename ViewTraits<void,Prop...>::memory_space , void >::value && + std::is_same< typename ViewTraits<void,Prop...>::HostMirrorSpace , void >::value && + std::is_same< typename ViewTraits<void,Prop...>::array_layout , void >::value + , "Only one View Execution or Memory Space template argument" ); + + typedef typename Space::execution_space execution_space ; + typedef typename Space::memory_space memory_space ; + typedef typename Kokkos::Impl::is_space< Space >::host_mirror_space + HostMirrorSpace ; + typedef typename execution_space::array_layout array_layout ; + typedef typename ViewTraits<void,Prop...>::memory_traits memory_traits ; +}; + +template< class MemoryTraits , class ... Prop > +struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_memory_traits<MemoryTraits>::value >::type , MemoryTraits , Prop ... 
> +{ + // Specify memory trait, should not be any subsequent arguments + + static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value && + std::is_same< typename ViewTraits<void,Prop...>::memory_space , void >::value && + std::is_same< typename ViewTraits<void,Prop...>::array_layout , void >::value && + std::is_same< typename ViewTraits<void,Prop...>::memory_traits , void >::value + , "MemoryTrait is the final optional template argument for a View" ); + + typedef void execution_space ; + typedef void memory_space ; + typedef void HostMirrorSpace ; + typedef void array_layout ; + typedef MemoryTraits memory_traits ; +}; - // Layout, Space, and MemoryTraits are optional - // but need to appear in that order. That means Layout - // can only be Arg1, Space can be Arg1 or Arg2, and - // MemoryTraits can be Arg1, Arg2 or Arg3 - enum { Arg1IsLayout = Impl::is_array_layout<Arg1>::value }; +template< class DataType , class ... Properties > +struct ViewTraits { +private: - enum { Arg1IsSpace = Impl::is_space<Arg1>::value }; - enum { Arg2IsSpace = Impl::is_space<Arg2>::value }; + // Unpack the properties arguments + typedef ViewTraits< void , Properties ... > prop ; - enum { Arg1IsMemoryTraits = Impl::is_memory_traits<Arg1>::value }; - enum { Arg2IsMemoryTraits = Impl::is_memory_traits<Arg2>::value }; - enum { Arg3IsMemoryTraits = Impl::is_memory_traits<Arg3>::value }; + typedef typename + std::conditional< ! std::is_same< typename prop::execution_space , void >::value + , typename prop::execution_space + , Kokkos::DefaultExecutionSpace + >::type + ExecutionSpace ; - enum { Arg1IsVoid = Impl::is_same< Arg1 , void >::value }; - enum { Arg2IsVoid = Impl::is_same< Arg2 , void >::value }; - enum { Arg3IsVoid = Impl::is_same< Arg3 , void >::value }; + typedef typename + std::conditional< ! 
std::is_same< typename prop::memory_space , void >::value + , typename prop::memory_space + , typename ExecutionSpace::memory_space + >::type + MemorySpace ; - // Arg1 is Layout, Space, MemoryTraits, or void typedef typename - Impl::StaticAssert< - ( 1 == Arg1IsLayout + Arg1IsSpace + Arg1IsMemoryTraits + Arg1IsVoid ) - , Arg1 >::type Arg1Verified ; - - // If Arg1 is Layout then Arg2 is Space, MemoryTraits, or void - // If Arg1 is Space then Arg2 is MemoryTraits or void - // If Arg1 is MemoryTraits then Arg2 is void - // If Arg1 is Void then Arg2 is void + std::conditional< ! std::is_same< typename prop::array_layout , void >::value + , typename prop::array_layout + , typename ExecutionSpace::array_layout + >::type + ArrayLayout ; + typedef typename - Impl::StaticAssert< - ( Arg1IsLayout && ( 1 == Arg2IsSpace + Arg2IsMemoryTraits + Arg2IsVoid ) ) || - ( Arg1IsSpace && ( 0 == Arg2IsSpace ) && ( 1 == Arg2IsMemoryTraits + Arg2IsVoid ) ) || - ( Arg1IsMemoryTraits && Arg2IsVoid ) || - ( Arg1IsVoid && Arg2IsVoid ) - , Arg2 >::type Arg2Verified ; - - // Arg3 is MemoryTraits or void and at most one argument is MemoryTraits + std::conditional + < ! 
std::is_same< typename prop::HostMirrorSpace , void >::value + , typename prop::HostMirrorSpace + , typename Kokkos::Impl::is_space< ExecutionSpace >::host_mirror_space + >::type + HostMirrorSpace ; + typedef typename - Impl::StaticAssert< - ( 1 == Arg3IsMemoryTraits + Arg3IsVoid ) && - ( Arg1IsMemoryTraits + Arg2IsMemoryTraits + Arg3IsMemoryTraits <= 1 ) - , Arg3 >::type Arg3Verified ; - - // Arg1 or Arg2 may have execution and memory spaces - typedef typename Impl::if_c<( Arg1IsSpace ), Arg1Verified , - typename Impl::if_c<( Arg2IsSpace ), Arg2Verified , - Kokkos::DefaultExecutionSpace - >::type >::type::execution_space ExecutionSpace ; - - typedef typename Impl::if_c<( Arg1IsSpace ), Arg1Verified , - typename Impl::if_c<( Arg2IsSpace ), Arg2Verified , - Kokkos::DefaultExecutionSpace - >::type >::type::memory_space MemorySpace ; - - typedef typename Impl::is_space< - typename Impl::if_c<( Arg1IsSpace ), Arg1Verified , - typename Impl::if_c<( Arg2IsSpace ), Arg2Verified , - Kokkos::DefaultExecutionSpace - >::type >::type >::host_mirror_space HostMirrorSpace ; - - // Arg1 may be array layout - typedef typename Impl::if_c< Arg1IsLayout , Arg1Verified , - typename ExecutionSpace::array_layout - >::type ArrayLayout ; - - // Arg1, Arg2, or Arg3 may be memory traits - typedef typename Impl::if_c< Arg1IsMemoryTraits , Arg1Verified , - typename Impl::if_c< Arg2IsMemoryTraits , Arg2Verified , - typename Impl::if_c< Arg3IsMemoryTraits , Arg3Verified , - MemoryManaged - >::type >::type >::type MemoryTraits ; - - typedef Impl::AnalyzeShape<DataType> analysis ; + std::conditional< ! 
std::is_same< typename prop::memory_traits , void >::value + , typename prop::memory_traits + , typename Kokkos::MemoryManaged + >::type + MemoryTraits ; + + // Analyze data type's properties, + // May be specialized based upon the layout and value type + typedef Kokkos::Experimental::Impl::ViewDataAnalysis< DataType , ArrayLayout > data_analysis ; public: //------------------------------------ // Data type traits: - typedef DataType data_type ; - typedef typename analysis::const_type const_data_type ; - typedef typename analysis::non_const_type non_const_data_type ; + typedef typename data_analysis::type data_type ; + typedef typename data_analysis::const_type const_data_type ; + typedef typename data_analysis::non_const_type non_const_data_type ; //------------------------------------ - // Array of intrinsic scalar type traits: + // Compatible array of trivial type traits: - typedef typename analysis::array_intrinsic_type array_intrinsic_type ; - typedef typename analysis::const_array_intrinsic_type const_array_intrinsic_type ; - typedef typename analysis::non_const_array_intrinsic_type non_const_array_intrinsic_type ; + typedef typename data_analysis::scalar_array_type scalar_array_type ; + typedef typename data_analysis::const_scalar_array_type const_scalar_array_type ; + typedef typename data_analysis::non_const_scalar_array_type non_const_scalar_array_type ; //------------------------------------ // Value type traits: - typedef typename analysis::value_type value_type ; - typedef typename analysis::const_value_type const_value_type ; - typedef typename analysis::non_const_value_type non_const_value_type ; + typedef typename data_analysis::value_type value_type ; + typedef typename data_analysis::const_value_type const_value_type ; + typedef typename data_analysis::non_const_value_type non_const_value_type ; //------------------------------------ - // Layout and shape traits: + // Mapping traits: - typedef ArrayLayout array_layout ; - typedef typename 
analysis::shape shape_type ; + typedef ArrayLayout array_layout ; + typedef typename data_analysis::dimension dimension ; + typedef typename data_analysis::specialize specialize /* mapping specialization tag */ ; - enum { rank = shape_type::rank }; - enum { rank_dynamic = shape_type::rank_dynamic }; + enum { rank = dimension::rank }; + enum { rank_dynamic = dimension::rank_dynamic }; //------------------------------------ // Execution space, memory space, memory access traits, and host mirror space. - typedef ExecutionSpace execution_space ; - typedef MemorySpace memory_space ; - typedef Device<ExecutionSpace,MemorySpace> device_type ; - typedef MemoryTraits memory_traits ; - typedef HostMirrorSpace host_mirror_space ; + typedef ExecutionSpace execution_space ; + typedef MemorySpace memory_space ; + typedef Kokkos::Device<ExecutionSpace,MemorySpace> device_type ; + typedef MemoryTraits memory_traits ; + typedef HostMirrorSpace host_mirror_space ; - typedef typename memory_space::size_type size_type ; + typedef typename MemorySpace::size_type size_type ; - enum { is_hostspace = Impl::is_same< memory_space , HostSpace >::value }; - enum { is_managed = memory_traits::Unmanaged == 0 }; - enum { is_random_access = memory_traits::RandomAccess == 1 }; + enum { is_hostspace = std::is_same< MemorySpace , HostSpace >::value }; + enum { is_managed = MemoryTraits::Unmanaged == 0 }; + enum { is_random_access = MemoryTraits::RandomAccess == 1 }; //------------------------------------ - - - //------------------------------------ - // Specialization tag: - - typedef typename - Impl::ViewSpecialize< value_type - , typename analysis::specialize - , array_layout - , memory_space - , memory_traits - >::type specialize ; -}; - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -class ViewDefault {}; - -/** 
\brief Default view specialization has LayoutLeft, LayoutRight, or LayoutStride. - */ -template< class ValueType , class MemorySpace , class MemoryTraits > -struct ViewSpecialize< ValueType , void , LayoutLeft , MemorySpace , MemoryTraits > -{ typedef ViewDefault type ; }; - -template< class ValueType , class MemorySpace , class MemoryTraits > -struct ViewSpecialize< ValueType , void , LayoutRight , MemorySpace , MemoryTraits > -{ typedef ViewDefault type ; }; - -template< class ValueType , class MemorySpace , class MemoryTraits > -struct ViewSpecialize< ValueType , void , LayoutStride , MemorySpace , MemoryTraits > -{ typedef ViewDefault type ; }; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief Types for compile-time detection of View usage errors */ -namespace ViewError { - -struct allocation_constructor_requires_managed {}; -struct allocation_constructor_requires_nonconst {}; -struct user_pointer_constructor_requires_unmanaged {}; -struct device_shmem_constructor_requires_unmanaged {}; - -struct scalar_operator_called_from_non_scalar_view {}; - -} /* namespace ViewError */ - -//---------------------------------------------------------------------------- -/** \brief Enable view parentheses operator for - * match of layout and integral arguments. - * If correct rank define type from traits, - * otherwise define type as an error message. 
- */ -template< class ReturnType , class Traits , class Layout , unsigned Rank , - typename iType0 = int , typename iType1 = int , - typename iType2 = int , typename iType3 = int , - typename iType4 = int , typename iType5 = int , - typename iType6 = int , typename iType7 = int , - class Enable = void > -struct ViewEnableArrayOper ; - -template< class ReturnType , class Traits , class Layout , unsigned Rank , - typename iType0 , typename iType1 , - typename iType2 , typename iType3 , - typename iType4 , typename iType5 , - typename iType6 , typename iType7 > -struct ViewEnableArrayOper< - ReturnType , Traits , Layout , Rank , - iType0 , iType1 , iType2 , iType3 , - iType4 , iType5 , iType6 , iType7 , - typename enable_if< - iType0(0) == 0 && iType1(0) == 0 && iType2(0) == 0 && iType3(0) == 0 && - iType4(0) == 0 && iType5(0) == 0 && iType6(0) == 0 && iType7(0) == 0 && - is_same< typename Traits::array_layout , Layout >::value && - ( unsigned(Traits::rank) == Rank ) - >::type > -{ - typedef ReturnType type ; }; -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - /** \class View * \brief View to an array of data. * @@ -376,11 +310,13 @@ namespace Kokkos { * they may occur. 
* * Valid ways in which template arguments may be specified: - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , Space , void , MemoryTraits > + * - View< DataType > + * - View< DataType , Layout > * - View< DataType , Layout , Space > * - View< DataType , Layout , Space , MemoryTraits > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , MemoryTraits > * * \tparam DataType (required) This indicates both the type of each * entry of the array, and the combination of compile-time and @@ -437,1194 +373,1425 @@ namespace Kokkos { * } * \endcode */ -template< class DataType , - class Arg1Type = void , /* ArrayLayout, SpaceType, or MemoryTraits */ - class Arg2Type = void , /* SpaceType or MemoryTraits */ - class Arg3Type = void , /* MemoryTraits */ - class Specialize = - typename ViewTraits<DataType,Arg1Type,Arg2Type,Arg3Type>::specialize > +template< class DataType , class ... Properties > class View ; -template< class C > -struct is_view : public Impl::bool_< false > {}; - -template< class D , class A1 , class A2 , class A3 , class S > -struct is_view< View< D , A1 , A2 , A3 , S > > : public Impl::bool_< true > {}; - -namespace Impl { -using Kokkos::is_view ; -} +} /* namespace Experimental */ +} /* namespace Kokkos */ +//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -template< class DataType , - class Arg1Type , - class Arg2Type , - class Arg3Type > -class View< DataType , Arg1Type , Arg2Type , Arg3Type , Impl::ViewDefault > - : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > -{ -public: +#include <impl/KokkosExp_ViewMapping.hpp> +#include <impl/KokkosExp_ViewArray.hpp> - typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ; +//---------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------- -private: +namespace Kokkos { +namespace Experimental { - // Assignment of compatible views requirement: - template< class , class , class , class , class > friend class View ; +namespace { - // Assignment of compatible subview requirement: - template< class , class , class > friend struct Impl::ViewAssignment ; +constexpr Kokkos::Experimental::Impl::ALL_t + ALL = Kokkos::Experimental::Impl::ALL_t(); - // Dimensions, cardinality, capacity, and offset computation for - // multidimensional array view of contiguous memory. - // Inherits from Impl::Shape - typedef Impl::ViewOffset< typename traits::shape_type - , typename traits::array_layout - > offset_map_type ; +constexpr Kokkos::Experimental::Impl::WithoutInitializing_t + WithoutInitializing = Kokkos::Experimental::Impl::WithoutInitializing_t(); - // Intermediary class for data management and access - typedef Impl::ViewDataManagement< traits > view_data_management ; +constexpr Kokkos::Experimental::Impl::AllowPadding_t + AllowPadding = Kokkos::Experimental::Impl::AllowPadding_t(); - //---------------------------------------- - // Data members: +} - typename view_data_management::handle_type m_ptr_on_device ; - offset_map_type m_offset_map ; - view_data_management m_management ; - Impl::AllocationTracker m_tracker ; +/** \brief Create View allocation parameter bundle from argument list. + * + * Valid argument list members are: + * 1) label as a "string" or std::string + * 2) memory space instance of the View::memory_space type + * 3) execution space instance compatible with the View::memory_space + * 4) Kokkos::WithoutInitializing to bypass initialization + * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory alignment + */ +template< class ... Args > +inline +Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... > +view_alloc( Args const & ... 
args ) +{ + typedef + Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... > + return_type ; - //---------------------------------------- + static_assert( ! return_type::has_pointer + , "Cannot give pointer-to-memory for view allocation" ); -public: + return return_type( args... ); +} - /** return type for all indexing operators */ - typedef typename view_data_management::return_type reference_type ; +template< class ... Args > +inline +Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... > +view_wrap( Args const & ... args ) +{ + typedef + Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... > + return_type ; - enum { reference_type_is_lvalue = view_data_management::ReturnTypeIsReference }; + static_assert( ! return_type::has_memory_space && + ! return_type::has_execution_space && + ! return_type::has_label && + return_type::has_pointer + , "Must only give pointer-to-memory for view wrapping" ); - typedef View< typename traits::array_intrinsic_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > array_type ; + return return_type( args... 
); +} - typedef View< typename traits::const_data_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > const_type ; +} /* namespace Experimental */ +} /* namespace Kokkos */ - typedef View< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > non_const_type ; +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- - typedef View< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::host_mirror_space , - void > HostMirror ; +namespace Kokkos { +namespace Experimental { - //------------------------------------ - // Shape +template< class DataType , class ... Properties > +class View ; - enum { Rank = traits::rank }; +template< class > struct is_view : public std::false_type {}; - KOKKOS_INLINE_FUNCTION offset_map_type shape() const { return m_offset_map ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type size() const { return m_offset_map.cardinality(); } +template< class D, class ... 
P > +struct is_view< View<D,P...> > : public std::true_type {}; - template< typename iType > - KOKKOS_INLINE_FUNCTION - typename traits::size_type dimension( const iType & i ) const - { return Impl::dimension( m_offset_map , i ); } +template< class D, class ... P > +struct is_view< const View<D,P...> > : public std::true_type {}; - //------------------------------------ - // Destructor, constructors, assignment operators: +template< class DataType , class ... Properties > +class View : public ViewTraits< DataType , Properties ... > { +private: - KOKKOS_INLINE_FUNCTION - ~View() {} + template< class , class ... > friend class View ; + template< class , class ... > friend class Impl::ViewMapping ; - KOKKOS_INLINE_FUNCTION - View() - : m_ptr_on_device() - , m_offset_map() - , m_management() - , m_tracker() - { m_offset_map.assign(0, 0,0,0,0,0,0,0,0); } +public: - KOKKOS_INLINE_FUNCTION - View( const View & rhs ) - : m_ptr_on_device() - , m_offset_map() - , m_management() - , m_tracker() - { - (void) Impl::ViewAssignment< - typename traits::specialize , - typename traits::specialize >( *this , rhs ); - } + typedef ViewTraits< DataType , Properties ... 
> traits ; - KOKKOS_INLINE_FUNCTION - View & operator = ( const View & rhs ) - { - (void) Impl::ViewAssignment< - typename traits::specialize , - typename traits::specialize >( *this , rhs ); - return *this ; - } +private: - //------------------------------------ - // Construct or assign compatible view: + typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ; + typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ; - template< class RT , class RL , class RD , class RM , class RS > - KOKKOS_INLINE_FUNCTION - View( const View<RT,RL,RD,RM,RS> & rhs ) - : m_ptr_on_device() - , m_offset_map() - , m_management() - , m_tracker() - { - (void) Impl::ViewAssignment< - typename traits::specialize , RS >( *this , rhs ); - } + track_type m_track ; + map_type m_map ; - template< class RT , class RL , class RD , class RM , class RS > - KOKKOS_INLINE_FUNCTION - View & operator = ( const View<RT,RL,RD,RM,RS> & rhs ) - { - (void) Impl::ViewAssignment< - typename traits::specialize , RS >( *this , rhs ); - return *this ; - } +public: - //------------------------------------ - /**\brief Allocation of a managed view with possible alignment padding. 
- * - * Allocation properties for allocating and initializing to the default value_type: - * Kokkos::ViewAllocate() - * Kokkos::ViewAllocate("label") OR "label" - * Kokkos::ViewAllocate(std::string("label")) OR std::string("label") - * - * Allocation properties for allocating and bypassing initialization: - * Kokkos::ViewAllocateWithoutInitializing() - * Kokkos::ViewAllocateWithoutInitializing("label") - */ + //---------------------------------------- + /** \brief Compatible view of array of scalar types */ + typedef View< typename traits::scalar_array_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > + array_type ; - template< class AllocationProperties > - explicit inline - View( const AllocationProperties & prop , - // Impl::ViewAllocProp::size_type exists when the traits and allocation properties - // are valid for allocating viewed memory. - const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 , - const size_t n8 = 0 ) - : m_ptr_on_device() - , m_offset_map() - , m_management() - , m_tracker() - { - typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ; + /** \brief Compatible view of const data type */ + typedef View< typename traits::const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > + const_type ; - static_assert(!std::is_same<typename traits::array_layout, LayoutStride>::value, - "LayoutStride does not support View constructor which takes dimensions directly!"); + /** \brief Compatible view of non-const data type */ + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > + non_const_type ; - m_offset_map.assign( n0, n1, 
n2, n3, n4, n5, n6, n7, n8 ); - if(Alloc::AllowPadding) - m_offset_map.set_padding(); + /** \brief Compatible HostMirror view */ + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::host_mirror_space > + HostMirror ; - m_ptr_on_device = view_data_management::template allocate< Alloc::Initialize >( Alloc::label(prop) , m_offset_map, m_tracker ); + //---------------------------------------- + // Domain rank and extents - } + enum { Rank = map_type::Rank }; - template< class AllocationProperties > - explicit inline - View( const AllocationProperties & prop , - const typename traits::array_layout & layout , - // Impl::ViewAllocProp::size_type exists when the traits and allocation properties - // are valid for allocating viewed memory. - const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type = 0 ) - : m_ptr_on_device() - , m_offset_map() - , m_management() - , m_tracker() - { - typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ; + /** \brief rank() to be implemented + */ + //KOKKOS_INLINE_FUNCTION + //static + //constexpr unsigned rank() { return map_type::Rank; } - m_offset_map.assign( layout ); - if(Alloc::AllowPadding) - m_offset_map.set_padding(); + template< typename iType > + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if< std::is_integral<iType>::value , size_t >::type + extent( const iType & r ) const + { return m_map.extent(r); } - m_ptr_on_device = view_data_management::template allocate< Alloc::Initialize >( Alloc::label(prop) , m_offset_map, m_tracker ); + template< typename iType > + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if< std::is_integral<iType>::value , int >::type + extent_int( const iType & r ) const + { return static_cast<int>(m_map.extent(r)); } - m_management.set_noncontiguous(); - } + KOKKOS_INLINE_FUNCTION constexpr + typename traits::array_layout layout() const + { return m_map.layout(); } - 
//------------------------------------ - // Assign an unmanaged View from pointer, can be called in functors. - // No alignment padding is performed. + //---------------------------------------- + /* Deprecate all 'dimension' functions in favor of + * ISO/C++ vocabulary 'extent'. + */ - template< class Type > - explicit KOKKOS_INLINE_FUNCTION - View( Type * ptr , - typename Impl::ViewRawPointerProp< traits , Type >::size_type n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 , - const size_t n8 = 0 ) - : m_ptr_on_device(ptr) - , m_offset_map() - , m_management() - , m_tracker() - { - m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 ); - m_management.set_unmanaged(); - } + template< typename iType > + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if< std::is_integral<iType>::value , size_t >::type + dimension( const iType & r ) const { return extent( r ); } + + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); } - template< class Type > - explicit KOKKOS_INLINE_FUNCTION - View( Type * ptr , - typename traits::array_layout const & layout , - typename Impl::ViewRawPointerProp< traits , Type >::size_type = 0 ) - : m_ptr_on_device(ptr) - , 
m_offset_map() - , m_management() - , m_tracker() - { - m_offset_map.assign( layout ); - m_management.set_unmanaged(); - m_management.set_noncontiguous(); - } + //---------------------------------------- + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return m_map.dimension_0() * + m_map.dimension_1() * + m_map.dimension_2() * + m_map.dimension_3() * + m_map.dimension_4() * + m_map.dimension_5() * + m_map.dimension_6() * + m_map.dimension_7(); } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); } + template< typename iType > + KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_map.stride(s); } - //------------------------------------ - // Assign a View from an AllocationTracker, - // The allocator used must be compatiable with the memory space of the view - // No alignment padding is performed. - // TODO: Should these allow padding??? 
DJS 01/15/15 - explicit - View( Impl::AllocationTracker const &arg_tracker , - const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 , - const size_t n8 = 0 ) - : m_ptr_on_device(reinterpret_cast<typename traits::value_type*>(arg_tracker.alloc_ptr())) - , m_offset_map() - , m_management() - , m_tracker(arg_tracker) - { - m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 ); + //---------------------------------------- + // Range span is the span which contains all members. - const size_t req_size = m_offset_map.capacity() * sizeof(typename traits::value_type); - if ( m_tracker.alloc_size() < req_size ) { - Impl::throw_runtime_exception("Error: tracker.alloc_size() < req_size"); - } - } + typedef typename map_type::reference_type reference_type ; + typedef typename map_type::pointer_type pointer_type ; - explicit - View( Impl::AllocationTracker const & arg_tracker - , typename traits::array_layout const & layout ) - : m_ptr_on_device(reinterpret_cast<typename traits::value_type*>(arg_tracker.alloc_ptr())) - , m_offset_map() - , m_management() - , m_tracker(arg_tracker) - { - m_offset_map.assign( layout ); + enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value }; - const size_t req_size = m_offset_map.capacity() * sizeof(typename traits::value_type); - if ( m_tracker.alloc_size() < req_size ) { - Impl::throw_runtime_exception("Error: tracker.alloc_size() < req_size"); - } + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + // Deprecated, use 'span()' instead + KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_map.span_is_contiguous(); } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return m_map.data(); } - 
m_management.set_noncontiguous(); - } + // Deprecated, use 'span_is_contigous()' instead + KOKKOS_INLINE_FUNCTION constexpr bool is_contiguous() const { return m_map.span_is_contiguous(); } + // Deprecated, use 'data()' instead + KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const { return m_map.data(); } - //------------------------------------ - /** \brief Constructors for subviews requires following - * type-compatibility condition, enforce via StaticAssert. - * - * Impl::is_same< View , - * typename Impl::ViewSubview< View<D,A1,A2,A3,Impl::ViewDefault> - * , ArgType0 , ArgType1 , ArgType2 , ArgType3 - * , ArgType4 , ArgType5 , ArgType6 , ArgType7 - * >::type >::value - */ - template< class D , class A1 , class A2 , class A3 - , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type - , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type - > - KOKKOS_INLINE_FUNCTION - View( const View<D,A1,A2,A3,Impl::ViewDefault> & src - , const SubArg0_type & arg0 , const SubArg1_type & arg1 - , const SubArg2_type & arg2 , const SubArg3_type & arg3 - , const SubArg4_type & arg4 , const SubArg5_type & arg5 - , const SubArg6_type & arg6 , const SubArg7_type & arg7 - ); - - template< class D , class A1 , class A2 , class A3 - , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type - , class SubArg4_type , class SubArg5_type , class SubArg6_type - > - KOKKOS_INLINE_FUNCTION - View( const View<D,A1,A2,A3,Impl::ViewDefault> & src - , const SubArg0_type & arg0 , const SubArg1_type & arg1 - , const SubArg2_type & arg2 , const SubArg3_type & arg3 - , const SubArg4_type & arg4 , const SubArg5_type & arg5 - , const SubArg6_type & arg6 - ); - - template< class D , class A1 , class A2 , class A3 - , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type - , class SubArg4_type , class SubArg5_type - > - KOKKOS_INLINE_FUNCTION - View( const 
View<D,A1,A2,A3,Impl::ViewDefault> & src - , const SubArg0_type & arg0 , const SubArg1_type & arg1 - , const SubArg2_type & arg2 , const SubArg3_type & arg3 - , const SubArg4_type & arg4 , const SubArg5_type & arg5 - ); - - template< class D , class A1 , class A2 , class A3 - , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type - , class SubArg4_type - > - KOKKOS_INLINE_FUNCTION - View( const View<D,A1,A2,A3,Impl::ViewDefault> & src - , const SubArg0_type & arg0 , const SubArg1_type & arg1 - , const SubArg2_type & arg2 , const SubArg3_type & arg3 - , const SubArg4_type & arg4 - ); - - template< class D , class A1 , class A2 , class A3 - , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type - > - KOKKOS_INLINE_FUNCTION - View( const View<D,A1,A2,A3,Impl::ViewDefault> & src - , const SubArg0_type & arg0 , const SubArg1_type & arg1 - , const SubArg2_type & arg2 , const SubArg3_type & arg3 - ); - - template< class D , class A1 , class A2 , class A3 - , class SubArg0_type , class SubArg1_type , class SubArg2_type - > - KOKKOS_INLINE_FUNCTION - View( const View<D,A1,A2,A3,Impl::ViewDefault> & src - , const SubArg0_type & arg0 , const SubArg1_type & arg1 - , const SubArg2_type & arg2 - ); - - template< class D , class A1 , class A2 , class A3 - , class SubArg0_type , class SubArg1_type - > - KOKKOS_INLINE_FUNCTION - View( const View<D,A1,A2,A3,Impl::ViewDefault> & src - , const SubArg0_type & arg0 , const SubArg1_type & arg1 - ); + //---------------------------------------- + // Allow specializations to query their specialized map - template< class D , class A1 , class A2 , class A3 - , class SubArg0_type - > KOKKOS_INLINE_FUNCTION - View( const View<D,A1,A2,A3,Impl::ViewDefault> & src - , const SubArg0_type & arg0 - ); - - //------------------------------------ - // Assign unmanaged View to portion of execution space's shared memory - - typedef Impl::if_c< ! 
traits::is_managed , - const typename traits::execution_space::scratch_memory_space & , - Impl::ViewError::device_shmem_constructor_requires_unmanaged > - if_scratch_memory_constructor ; - - explicit KOKKOS_INLINE_FUNCTION - View( typename if_scratch_memory_constructor::type space , - const unsigned n0 = 0 , - const unsigned n1 = 0 , - const unsigned n2 = 0 , - const unsigned n3 = 0 , - const unsigned n4 = 0 , - const unsigned n5 = 0 , - const unsigned n6 = 0 , - const unsigned n7 = 0 ) - : m_ptr_on_device() - , m_offset_map() - , m_management() - , m_tracker() - { - typedef typename traits::value_type value_type_ ; - - enum { align = 8 }; - enum { mask = align - 1 }; - - m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); + const Kokkos::Experimental::Impl::ViewMapping< traits , void > & + implementation_map() const { return m_map ; } - typedef Impl::if_c< ! traits::is_managed , - value_type_ * , - Impl::ViewError::device_shmem_constructor_requires_unmanaged > - if_device_shmem_pointer ; + //---------------------------------------- - // Select the first argument: - m_ptr_on_device = if_device_shmem_pointer::select( - (value_type_*) space.get_shmem( unsigned( sizeof(value_type_) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) ); - } +private: - explicit KOKKOS_INLINE_FUNCTION - View( typename if_scratch_memory_constructor::type space , - typename traits::array_layout const & layout) - : m_ptr_on_device() - , m_offset_map() - , m_management() - , m_tracker() - { - typedef typename traits::value_type value_type_ ; + enum { + is_layout_left = std::is_same< typename traits::array_layout + , Kokkos::LayoutLeft >::value , - typedef Impl::if_c< ! 
traits::is_managed , - value_type_ * , - Impl::ViewError::device_shmem_constructor_requires_unmanaged > - if_device_shmem_pointer ; + is_layout_right = std::is_same< typename traits::array_layout + , Kokkos::LayoutRight >::value , - m_offset_map.assign( layout ); - m_management.set_unmanaged(); - m_management.set_noncontiguous(); + is_layout_stride = std::is_same< typename traits::array_layout + , Kokkos::LayoutStride >::value , - enum { align = 8 }; - enum { mask = align - 1 }; + is_default_map = + std::is_same< typename traits::specialize , void >::value && + ( is_layout_left || is_layout_right || is_layout_stride ) + }; - // Select the first argument: - m_ptr_on_device = if_device_shmem_pointer::select( - (value_type_*) space.get_shmem( unsigned( sizeof(value_type_) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) ); - } +#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) - static inline - unsigned shmem_size( const unsigned n0 = 0 , - const unsigned n1 = 0 , - const unsigned n2 = 0 , - const unsigned n3 = 0 , - const unsigned n4 = 0 , - const unsigned n5 = 0 , - const unsigned n6 = 0 , - const unsigned n7 = 0 ) - { - enum { align = 8 }; - enum { mask = align - 1 }; +#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \ + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \ + < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \ + Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ; - typedef typename traits::value_type value_type_ ; +#else - offset_map_type offset_map ; +#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \ + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \ + < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); - offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); +#endif - return unsigned( sizeof(value_type_) * offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ; - } +public: - //------------------------------------ - // Is not allocated + 
//------------------------------ + // Rank 0 operator() + template< class ... Args > KOKKOS_FORCEINLINE_FUNCTION - bool is_null() const { return 0 == ptr_on_device() ; } - - //------------------------------------ - // Operators for scalar (rank zero) views. - - typedef Impl::if_c< traits::rank == 0 , - typename traits::value_type , - Impl::ViewError::scalar_operator_called_from_non_scalar_view > - if_scalar_operator ; - - typedef Impl::if_c< traits::rank == 0 , - reference_type , - Impl::ViewError::scalar_operator_called_from_non_scalar_view > - if_scalar_operator_return ; - KOKKOS_INLINE_FUNCTION - const View & operator = ( const typename if_scalar_operator::type & rhs ) const + typename std::enable_if<( Kokkos::Impl::are_integral<Args...>::value + && ( 0 == Rank ) + ), reference_type >::type + operator()( Args ... args ) const { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); - m_ptr_on_device[ 0 ] = if_scalar_operator::select( rhs ); - return *this ; - } + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,args...) ) - KOKKOS_FORCEINLINE_FUNCTION - operator typename if_scalar_operator_return::type () const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); - return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] ); + return m_map.reference(); } - KOKKOS_FORCEINLINE_FUNCTION - typename if_scalar_operator_return::type operator()() const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); - return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] ); - } + //------------------------------ + // Rank 1 operator() + template< typename I0 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename if_scalar_operator_return::type operator*() const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,Args...>::value + && ( 1 == Rank ) + && ! is_default_map + ), reference_type >::type + operator()( const I0 & i0 + , Args ... 
args ) const { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); - return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] ); - } + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) ) - //------------------------------------ - // Array member access operators enabled if - // (1) a zero value of all argument types are compile-time comparable to zero - // (2) the rank matches the number of arguments - // (3) the memory space is valid for the access - //------------------------------------ - // rank 1: - // Specialisation for LayoutLeft and LayoutRight since we know its stride 1 + return m_map.reference(i0); + } - template< typename iType0 > + template< typename I0 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutLeft, 1, iType0 >::type - operator[] ( const iType0 & i0 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,Args...>::value + && ( 1 == Rank ) + && is_default_map + && ! is_layout_stride + ), reference_type >::type + operator()( const I0 & i0 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) ) - return m_ptr_on_device[ i0 ]; + return m_map.m_handle[ i0 ]; } - template< typename iType0 > + template< typename I0 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutLeft, 1, iType0 >::type - operator() ( const iType0 & i0 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,Args...>::value + && ( 1 == Rank ) + && is_default_map + && is_layout_stride + ), reference_type >::type + operator()( const I0 & i0 + , Args ... 
args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) ) - return m_ptr_on_device[ i0 ]; + return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ]; } - template< typename iType0 > + //------------------------------ + // Rank 1 operator[] + + template< typename I0 > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutLeft, 1, iType0 >::type - at( const iType0 & i0 , const int , const int , const int , - const int , const int , const int , const int ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0>::value + && ( 1 == Rank ) + && ! is_default_map + ), reference_type >::type + operator[]( const I0 & i0 ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) ) - return m_ptr_on_device[ i0 ]; + return m_map.reference(i0); } - template< typename iType0 > + template< typename I0 > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutRight, 1, iType0 >::type - operator[] ( const iType0 & i0 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0>::value + && ( 1 == Rank ) + && is_default_map + && ! 
is_layout_stride + ), reference_type >::type + operator[]( const I0 & i0 ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) ) - return m_ptr_on_device[ i0 ]; + return m_map.m_handle[ i0 ]; } - template< typename iType0 > + template< typename I0 > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutRight, 1, iType0 >::type - operator() ( const iType0 & i0 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0>::value + && ( 1 == Rank ) + && is_default_map + && is_layout_stride + ), reference_type >::type + operator[]( const I0 & i0 ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) ) - return m_ptr_on_device[ i0 ]; + return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ]; } - template< typename iType0 > + //------------------------------ + // Rank 2 + + template< typename I0 , typename I1 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutRight, 1, iType0 >::type - at( const iType0 & i0 , const int , const int , const int , - const int , const int , const int , const int ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,Args...>::value + && ( 2 == Rank ) + && ! is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) - return m_ptr_on_device[ i0 ]; + return m_map.reference(i0,i1); } - template< typename iType0 > + template< typename I0 , typename I1 + , class ... 
Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , traits, - typename Impl::if_c< - Impl::is_same<typename traits::array_layout, LayoutRight>::value || - Impl::is_same<typename traits::array_layout, LayoutLeft>::value , - void, typename traits::array_layout>::type, - 1, iType0 >::type - operator[] ( const iType0 & i0 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,Args...>::value + && ( 2 == Rank ) + && is_default_map + && is_layout_left && ( traits::rank_dynamic == 0 ) + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) - return m_ptr_on_device[ m_offset_map(i0) ]; + return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ]; } - template< typename iType0 > + template< typename I0 , typename I1 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , traits, - typename Impl::if_c< - Impl::is_same<typename traits::array_layout, LayoutRight>::value || - Impl::is_same<typename traits::array_layout, LayoutLeft>::value , - void, typename traits::array_layout>::type, - 1, iType0 >::type - operator() ( const iType0 & i0 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,Args...>::value + && ( 2 == Rank ) + && is_default_map + && is_layout_left && ( traits::rank_dynamic != 0 ) + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) 
) - return m_ptr_on_device[ m_offset_map(i0) ]; + return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ]; } - template< typename iType0 > + template< typename I0 , typename I1 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , traits, - typename Impl::if_c< - Impl::is_same<typename traits::array_layout, LayoutRight>::value || - Impl::is_same<typename traits::array_layout, LayoutLeft>::value , - void, typename traits::array_layout>::type, - 1, iType0 >::type - at( const iType0 & i0 , const int , const int , const int , - const int , const int , const int , const int ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,Args...>::value + && ( 2 == Rank ) + && is_default_map + && is_layout_right && ( traits::rank_dynamic == 0 ) + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) - return m_ptr_on_device[ m_offset_map(i0) ]; + return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ]; } - // rank 2: - - template< typename iType0 , typename iType1 > + template< typename I0 , typename I1 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 2, iType0, iType1 >::type - operator() ( const iType0 & i0 , const iType1 & i1 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,Args...>::value + && ( 2 == Rank ) + && is_default_map + && is_layout_right && ( traits::rank_dynamic != 0 ) + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 + , Args ... 
args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1) ]; + return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ]; } - template< typename iType0 , typename iType1 > + template< typename I0 , typename I1 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 2, iType0, iType1 >::type - at( const iType0 & i0 , const iType1 & i1 , const int , const int , - const int , const int , const int , const int ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,Args...>::value + && ( 2 == Rank ) + && is_default_map + && is_layout_stride + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1) ]; + return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 + + i1 * m_map.m_offset.m_stride.S1 ]; } - // rank 3: + //------------------------------ + // Rank 3 - template< typename iType0 , typename iType1 , typename iType2 > + template< typename I0 , typename I1 , typename I2 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value + && ( 3 == Rank ) + && is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 + , Args ... 
args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1,i2) ]; + return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ]; } - template< typename iType0 , typename iType1 , typename iType2 > + template< typename I0 , typename I1 , typename I2 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const int , - const int , const int , const int , const int ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value + && ( 3 == Rank ) + && ! is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1,i2) ]; + return m_map.reference(i0,i1,i2); } - // rank 4: + //------------------------------ + // Rank 4 - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > + template< typename I0 , typename I1 , typename I2 , typename I3 + , class ... 
Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value + && ( 4 == Rank ) + && is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3) ]; + return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ]; } - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > + template< typename I0 , typename I1 , typename I2 , typename I3 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const int , const int , const int , const int ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value + && ( 4 == Rank ) + && ! is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) 
) - return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3) ]; + return m_map.reference(i0,i1,i2,i3); } - // rank 5: + //------------------------------ + // Rank 5 - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 > + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3 , iType4 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value + && ( 5 == Rank ) + && is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4) ]; + return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ]; } - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 > + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3 , iType4 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const int , const int , const int ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value + && ( 5 == Rank ) + && ! 
is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4) ]; + return m_map.reference(i0,i1,i2,i3,i4); } - // rank 6: + //------------------------------ + // Rank 6 - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 > + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 6, - iType0, iType1, iType2, iType3 , iType4, iType5 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value + && ( 6 == Rank ) + && is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5) ]; + return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ]; } - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 > + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 + , class ... 
Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 6, - iType0, iType1, iType2, iType3 , iType4, iType5 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const int , const int ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value + && ( 6 == Rank ) + && ! is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5) ]; + return m_map.reference(i0,i1,i2,i3,i4,i5); } - // rank 7: + //------------------------------ + // Rank 7 - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 > + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 , typename I6 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 7, - iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value + && ( 7 == Rank ) + && is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 , const I6 & i6 + , Args ... 
args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ]; + return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ]; } - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 > + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 , typename I6 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 7, - iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const int ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value + && ( 7 == Rank ) + && ! is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 , const I6 & i6 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) 
) - return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ]; + return m_map.reference(i0,i1,i2,i3,i4,i5,i6); } - // rank 8: + //------------------------------ + // Rank 8 - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 , typename iType7 > + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 , typename I6 , typename I7 + , class ... Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 8, - iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value + && ( 8 == Rank ) + && is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ]; + return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ]; } - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 , typename iType7 > + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 , typename I6 , typename I7 + , class ... 
Args > KOKKOS_FORCEINLINE_FUNCTION - typename Impl::ViewEnableArrayOper< reference_type , - traits, typename traits::array_layout, 8, - iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const + typename std::enable_if< + ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value + && ( 8 == Rank ) + && ! is_default_map + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 + , Args ... args ) const { - KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) ) - return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ]; + return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7); } - //------------------------------------ - // Access to the underlying contiguous storage of this view specialization. - // These methods are specific to specialization of a view. +#undef KOKKOS_VIEW_OPERATOR_VERIFY - KOKKOS_FORCEINLINE_FUNCTION - typename traits::value_type * ptr_on_device() const - { return (typename traits::value_type *) m_ptr_on_device ; } + //---------------------------------------- + // Standard destructor, constructors, and assignment operators - // Stride of physical storage, dimensioned to at least Rank - template< typename iType > KOKKOS_INLINE_FUNCTION - void stride( iType * const s ) const - { m_offset_map.stride(s); } + ~View() {} - // Count of contiguously allocated data members including padding. 
KOKKOS_INLINE_FUNCTION - typename traits::size_type capacity() const - { return m_offset_map.capacity(); } + View() : m_track(), m_map() {} - // If the view data can be treated (deep copied) - // as a contiguous block of memory. KOKKOS_INLINE_FUNCTION - bool is_contiguous() const - { return m_management.is_contiguous(); } + View( const View & rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {} - const Impl::AllocationTracker & tracker() const { return m_tracker; } + KOKKOS_INLINE_FUNCTION + View( View && rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {} + + KOKKOS_INLINE_FUNCTION + View & operator = ( const View & rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; } + + KOKKOS_INLINE_FUNCTION + View & operator = ( View && rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; } + + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. + + template< class RT , class ... RP > + KOKKOS_INLINE_FUNCTION + View( const View<RT,RP...> & rhs ) + : m_track( rhs.m_track , traits::is_managed ) + , m_map() + { + typedef typename View<RT,RP...>::traits SrcTraits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + static_assert( Mapping::is_assignable , "Incompatible View copy construction" ); + Mapping::assign( m_map , rhs.m_map , rhs.m_track ); + } + + template< class RT , class ... 
RP > + KOKKOS_INLINE_FUNCTION + View & operator = ( const View<RT,RP...> & rhs ) + { + typedef typename View<RT,RP...>::traits SrcTraits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + static_assert( Mapping::is_assignable , "Incompatible View copy assignment" ); + Mapping::assign( m_map , rhs.m_map , rhs.m_track ); + m_track.assign( rhs.m_track , traits::is_managed ); + return *this ; + } + + //---------------------------------------- + // Compatible subview constructor + // may assign unmanaged from managed. + + template< class RT , class ... RP , class Arg0 , class ... Args > + KOKKOS_INLINE_FUNCTION + View( const View< RT , RP... > & src_view + , const Arg0 & arg0 , Args ... args ) + : m_track( src_view.m_track , traits::is_managed ) + , m_map() + { + typedef View< RT , RP... > SrcType ; + + typedef Kokkos::Experimental::Impl::ViewMapping + < void /* deduce destination view type from source view traits */ + , typename SrcType::traits + , Arg0 , Args... > Mapping ; + + typedef typename Mapping::type DstType ; + + static_assert( Kokkos::Experimental::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable + , "Subview construction requires compatible view and subview arguments" ); + + Mapping::assign( m_map, src_view.m_map, arg0 , args... ); + } + + //---------------------------------------- + // Allocation tracking properties + + KOKKOS_INLINE_FUNCTION + int use_count() const + { return m_track.use_count(); } + + inline + const std::string label() const + { return m_track.template get_label< typename traits::memory_space >(); } + + //---------------------------------------- + // Allocation according to allocation properties and array layout + + template< class ... P > + explicit inline + View( const Impl::ViewCtorProp< P ... > & arg_prop + , typename std::enable_if< ! Impl::ViewCtorProp< P... 
>::has_pointer + , typename traits::array_layout + >::type const & arg_layout + ) + : m_track() + , m_map() + { + // Append layout and spaces if not input + typedef Impl::ViewCtorProp< P ... > alloc_prop_input ; + + // use 'std::integral_constant<unsigned,I>' for non-types + // to avoid duplicate class error. + typedef Impl::ViewCtorProp + < P ... + , typename std::conditional + < alloc_prop_input::has_label + , std::integral_constant<unsigned,0> + , typename std::string + >::type + , typename std::conditional + < alloc_prop_input::has_memory_space + , std::integral_constant<unsigned,1> + , typename traits::device_type::memory_space + >::type + , typename std::conditional + < alloc_prop_input::has_execution_space + , std::integral_constant<unsigned,2> + , typename traits::device_type::execution_space + >::type + > alloc_prop ; + + static_assert( traits::is_managed + , "View allocation constructor requires managed memory" ); + + if ( alloc_prop::initialize && + ! alloc_prop::execution_space::is_initialized() ) { + // If initializing view data then + // the execution space must be initialized. + Kokkos::Impl::throw_runtime_exception("Constructing View and initializing data with uninitialized execution space"); + } + + // Copy the input allocation properties with possibly defaulted properties + alloc_prop prop( arg_prop ); + +//------------------------------------------------------------ +#if defined( KOKKOS_HAVE_CUDA ) + // If allocating in CudaUVMSpace must fence before and after + // the allocation to protect against possible concurrent access + // on the CPU and the GPU. + // Fence using the trait's executon space (which will be Kokkos::Cuda) + // to avoid incomplete type errors from usng Kokkos::Cuda directly. 
+ if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) { + traits::device_type::memory_space::execution_space::fence(); + } +#endif +//------------------------------------------------------------ + + Kokkos::Experimental::Impl::SharedAllocationRecord<> * + record = m_map.allocate_shared( prop , arg_layout ); + +//------------------------------------------------------------ +#if defined( KOKKOS_HAVE_CUDA ) + if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) { + traits::device_type::memory_space::execution_space::fence(); + } +#endif +//------------------------------------------------------------ + + // Setup and initialization complete, start tracking + m_track.assign_allocated_record_to_uninitialized( record ); + } + + // Wrap memory according to properties and array layout + template< class ... P > + explicit KOKKOS_INLINE_FUNCTION + View( const Impl::ViewCtorProp< P ... > & arg_prop + , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer + , typename traits::array_layout + >::type const & arg_layout + ) + : m_track() // No memory tracking + , m_map( arg_prop , arg_layout ) + { + static_assert( + std::is_same< pointer_type + , typename Impl::ViewCtorProp< P... >::pointer_type + >::value , + "Constructing View to wrap user memory must supply matching pointer type" ); + } + + // Simple dimension-only layout + template< class ... P > + explicit inline + View( const Impl::ViewCtorProp< P ... > & arg_prop + , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer + , size_t + >::type const arg_N0 = 0 + , const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 + ) + : View( arg_prop + , typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) + ) + {} + + template< class ... 
P > + explicit KOKKOS_INLINE_FUNCTION + View( const Impl::ViewCtorProp< P ... > & arg_prop + , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer + , size_t + >::type const arg_N0 = 0 + , const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 + ) + : View( arg_prop + , typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) + ) + {} + + // Allocate with label and layout + template< typename Label > + explicit inline + View( const Label & arg_label + , typename std::enable_if< + Kokkos::Experimental::Impl::is_view_label<Label>::value , + typename traits::array_layout >::type const & arg_layout + ) + : View( Impl::ViewCtorProp< std::string >( arg_label ) , arg_layout ) + {} + + // Allocate label and layout, must disambiguate from subview constructor. + template< typename Label > + explicit inline + View( const Label & arg_label + , typename std::enable_if< + Kokkos::Experimental::Impl::is_view_label<Label>::value , + const size_t >::type arg_N0 = 0 + , const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 + ) + : View( Impl::ViewCtorProp< std::string >( arg_label ) + , typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) + ) + {} + + // For backward compatibility + explicit inline + View( const ViewAllocateWithoutInitializing & arg_prop + , const typename traits::array_layout & arg_layout + ) + : View( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing ) + , arg_layout + ) + {} + + explicit inline + View( const ViewAllocateWithoutInitializing & arg_prop + , const size_t arg_N0 = 0 + , 
const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 + ) + : View( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing ) + , typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) + ) + {} + + //---------------------------------------- + // Memory span required to wrap these dimensions. + static constexpr size_t required_allocation_size( + const size_t arg_N0 = 0 + , const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 + ) + { + return map_type::memory_span( + typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ); + } + + explicit KOKKOS_INLINE_FUNCTION + View( pointer_type arg_ptr + , const size_t arg_N0 = 0 + , const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 + ) + : View( Impl::ViewCtorProp<pointer_type>(arg_ptr) + , typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) + ) + {} + + explicit KOKKOS_INLINE_FUNCTION + View( pointer_type arg_ptr + , const typename traits::array_layout & arg_layout + ) + : View( Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_layout ) + {} + + //---------------------------------------- + // Shared scratch memory constructor + + static inline + size_t shmem_size( const size_t arg_N0 = ~size_t(0) , + const size_t arg_N1 = ~size_t(0) , + const size_t arg_N2 = ~size_t(0) , + const size_t arg_N3 = ~size_t(0) , + const size_t arg_N4 = ~size_t(0) , + const size_t arg_N5 = 
~size_t(0) , + const size_t arg_N6 = ~size_t(0) , + const size_t arg_N7 = ~size_t(0) ) + { + const size_t num_passed_args = + ( arg_N0 != ~size_t(0) ) + ( arg_N1 != ~size_t(0) ) + ( arg_N2 != ~size_t(0) ) + + ( arg_N3 != ~size_t(0) ) + ( arg_N4 != ~size_t(0) ) + ( arg_N5 != ~size_t(0) ) + + ( arg_N6 != ~size_t(0) ) + ( arg_N7 != ~size_t(0) ); + + if ( std::is_same<typename traits::specialize,void>::value && num_passed_args != traits::rank_dynamic ) { + Kokkos::abort( "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n" ); + } + + return map_type::memory_span( + typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ); + } + + explicit KOKKOS_INLINE_FUNCTION + View( const typename traits::execution_space::scratch_memory_space & arg_space + , const typename traits::array_layout & arg_layout ) + : View( Impl::ViewCtorProp<pointer_type>( + reinterpret_cast<pointer_type>( + arg_space.get_shmem( map_type::memory_span( arg_layout ) ) ) ) + , arg_layout ) + {} + + explicit KOKKOS_INLINE_FUNCTION + View( const typename traits::execution_space::scratch_memory_space & arg_space + , const size_t arg_N0 = 0 + , const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 ) + : View( Impl::ViewCtorProp<pointer_type>( + reinterpret_cast<pointer_type>( + arg_space.get_shmem( + map_type::memory_span( + typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) ) + , typename traits::array_layout + ( arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) + ) + {} }; -} /* namespace Kokkos */ + + /** \brief Temporary free function rank() + * until rank() is implemented + * in the View + */ + template < typename D , class ... 
P > + KOKKOS_INLINE_FUNCTION + constexpr unsigned rank( const View<D , P...> & V ) { return V.Rank; } //Temporary until added to view //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { +template< class V , class ... Args > +using Subview = + typename Kokkos::Experimental::Impl::ViewMapping + < void /* deduce subview type from source view traits */ + , typename V::traits + , Args ... + >::type ; -template< class LT , class LL , class LD , class LM , class LS , - class RT , class RL , class RD , class RM , class RS > +template< class D, class ... P , class ... Args > KOKKOS_INLINE_FUNCTION -typename Impl::enable_if<( Impl::is_same< LS , RS >::value ), bool >::type -operator == ( const View<LT,LL,LD,LM,LS> & lhs , - const View<RT,RL,RD,RM,RS> & rhs ) +typename Kokkos::Experimental::Impl::ViewMapping + < void /* deduce subview type from source view traits */ + , ViewTraits< D , P... > + , Args ... + >::type +subview( const View< D, P... > & src , Args ... args ) { - // Same data, layout, dimensions - typedef ViewTraits<LT,LL,LD,LM> lhs_traits ; - typedef ViewTraits<RT,RL,RD,RM> rhs_traits ; - - return - Impl::is_same< typename lhs_traits::const_data_type , - typename rhs_traits::const_data_type >::value && - Impl::is_same< typename lhs_traits::array_layout , - typename rhs_traits::array_layout >::value && - Impl::is_same< typename lhs_traits::memory_space , - typename rhs_traits::memory_space >::value && - Impl::is_same< typename lhs_traits::specialize , - typename rhs_traits::specialize >::value && - lhs.ptr_on_device() == rhs.ptr_on_device() && - lhs.shape() == rhs.shape() ; + static_assert( View< D , P... >::Rank == sizeof...(Args) , + "subview requires one argument for each source View rank" ); + + return typename + Kokkos::Experimental::Impl::ViewMapping + < void /* deduce subview type from source view traits */ + , ViewTraits< D , P ... 
> + , Args ... >::type( src , args ... ); } -template< class LT , class LL , class LD , class LM , class LS , - class RT , class RL , class RD , class RM , class RS > +template< class MemoryTraits , class D, class ... P , class ... Args > KOKKOS_INLINE_FUNCTION -bool operator != ( const View<LT,LL,LD,LM,LS> & lhs , - const View<RT,RL,RD,RM,RS> & rhs ) +typename Kokkos::Experimental::Impl::ViewMapping + < void /* deduce subview type from source view traits */ + , ViewTraits< D , P... > + , Args ... + >::template apply< MemoryTraits >::type +subview( const View< D, P... > & src , Args ... args ) { - return ! operator==( lhs , rhs ); + static_assert( View< D , P... >::Rank == sizeof...(Args) , + "subview requires one argument for each source View rank" ); + + return typename + Kokkos::Experimental::Impl::ViewMapping + < void /* deduce subview type from source view traits */ + , ViewTraits< D , P ... > + , Args ... > + ::template apply< MemoryTraits > + ::type( src , args ... ); } -//---------------------------------------------------------------------------- -} // namespace Kokkos +} /* namespace Experimental */ +} /* namespace Kokkos */ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { +namespace Experimental { -//---------------------------------------------------------------------------- -/** \brief Deep copy a value into a view. - */ -template< class DT , class DL , class DD , class DM , class DS > -inline -void deep_copy( const View<DT,DL,DD,DM,DS> & dst , - typename Impl::enable_if<( - Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::non_const_value_type , - typename ViewTraits<DT,DL,DD,DM>::value_type >::value - ), typename ViewTraits<DT,DL,DD,DM>::const_value_type >::type & value ) +template< class LT , class ... LP , class RT , class ... 
RP > +KOKKOS_INLINE_FUNCTION +bool operator == ( const View<LT,LP...> & lhs , + const View<RT,RP...> & rhs ) { - Impl::ViewFill< View<DT,DL,DD,DM,DS> >( dst , value ); + // Same data, layout, dimensions + typedef ViewTraits<LT,LP...> lhs_traits ; + typedef ViewTraits<RT,RP...> rhs_traits ; + + return + std::is_same< typename lhs_traits::const_value_type , + typename rhs_traits::const_value_type >::value && + std::is_same< typename lhs_traits::array_layout , + typename rhs_traits::array_layout >::value && + std::is_same< typename lhs_traits::memory_space , + typename rhs_traits::memory_space >::value && + unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && + lhs.data() == rhs.data() && + lhs.span() == rhs.span() && + lhs.dimension_0() == rhs.dimension_0() && + lhs.dimension_1() == rhs.dimension_1() && + lhs.dimension_2() == rhs.dimension_2() && + lhs.dimension_3() == rhs.dimension_3() && + lhs.dimension_4() == rhs.dimension_4() && + lhs.dimension_5() == rhs.dimension_5() && + lhs.dimension_6() == rhs.dimension_6() && + lhs.dimension_7() == rhs.dimension_7(); } -template< class ST , class SL , class SD , class SM , class SS > -inline -typename Impl::enable_if<( ViewTraits<ST,SL,SD,SM>::rank == 0 )>::type -deep_copy( ST & dst , const View<ST,SL,SD,SM,SS> & src ) +template< class LT , class ... LP , class RT , class ... RP > +KOKKOS_INLINE_FUNCTION +bool operator != ( const View<LT,LP...> & lhs , + const View<RT,RP...> & rhs ) { - typedef ViewTraits<ST,SL,SD,SM> src_traits ; - typedef typename src_traits::memory_space src_memory_space ; - Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.ptr_on_device() , sizeof(ST) ); + return ! ( operator==(lhs,rhs) ); } +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -/** \brief A deep copy between views of compatible type, and rank zero. 
- */ -template< class DT , class DL , class DD , class DM , class DS , - class ST , class SL , class SD , class SM , class SS > -inline -void deep_copy( const View<DT,DL,DD,DM,DS> & dst , - const View<ST,SL,SD,SM,SS> & src , - typename Impl::enable_if<( - // Same type and destination is not constant: - Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type , - typename View<ST,SL,SD,SM,SS>::non_const_value_type >::value - && - // Rank zero: - ( unsigned(View<DT,DL,DD,DM,DS>::rank) == unsigned(0) ) && - ( unsigned(View<ST,SL,SD,SM,SS>::rank) == unsigned(0) ) - )>::type * = 0 ) -{ - typedef View<DT,DL,DD,DM,DS> dst_type ; - typedef View<ST,SL,SD,SM,SS> src_type ; - typedef typename dst_type::memory_space dst_memory_space ; - typedef typename src_type::memory_space src_memory_space ; - typedef typename src_type::value_type value_type ; +namespace Kokkos { +namespace Impl { - if ( dst.ptr_on_device() != src.ptr_on_device() ) { - Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , sizeof(value_type) ); - } -} +inline +void shared_allocation_tracking_claim_and_disable() +{ Kokkos::Experimental::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); } -//---------------------------------------------------------------------------- -/** \brief A deep copy between views of the default specialization, compatible type, - * same non-zero rank, same contiguous layout. 
- */ -template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > inline -void deep_copy( const View<DT,DL,DD,DM,Impl::ViewDefault> & dst , - const View<ST,SL,SD,SM,Impl::ViewDefault> & src , - typename Impl::enable_if<( - // Same type and destination is not constant: - Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::value_type , - typename View<ST,SL,SD,SM,Impl::ViewDefault>::non_const_value_type >::value - && - // Same non-zero rank: - ( unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) == - unsigned(View<ST,SL,SD,SM,Impl::ViewDefault>::rank) ) - && - ( 0 < unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) ) - && - // Same layout: - Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , - typename View<ST,SL,SD,SM,Impl::ViewDefault>::array_layout >::value - )>::type * = 0 ) -{ - typedef View<DT,DL,DD,DM,Impl::ViewDefault> dst_type ; - typedef View<ST,SL,SD,SM,Impl::ViewDefault> src_type ; +void shared_allocation_tracking_release_and_enable() +{ Kokkos::Experimental::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); } - typedef typename dst_type::memory_space dst_memory_space ; - typedef typename src_type::memory_space src_memory_space ; +} /* namespace Impl */ +} /* namespace Kokkos */ - enum { is_contiguous = // Contiguous (e.g., non-strided, non-tiled) layout - Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutLeft >::value || - Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutRight >::value }; +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- - if ( dst.ptr_on_device() != src.ptr_on_device() ) { +namespace Kokkos { +namespace Experimental { +namespace Impl { - // Same shape (dimensions) +template< class OutputView , typename Enable = void > +struct ViewFill { - const bool shapes_are_equal = 
dst.shape() == src.shape(); + typedef typename OutputView::const_value_type const_value_type ; - if ( shapes_are_equal && is_contiguous && dst.capacity() == src.capacity() ) { + const OutputView output ; + const_value_type input ; - // Views span equal length contiguous range. - // Assuming can perform a straight memory copy over this range. + KOKKOS_INLINE_FUNCTION + void operator()( const size_t i0 ) const + { + const size_t n1 = output.dimension_1(); + const size_t n2 = output.dimension_2(); + const size_t n3 = output.dimension_3(); + const size_t n4 = output.dimension_4(); + const size_t n5 = output.dimension_5(); + const size_t n6 = output.dimension_6(); + const size_t n7 = output.dimension_7(); + + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) { + for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) { + for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) { + for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) { + for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) { + output(i0,i1,i2,i3,i4,i5,i6,i7) = input ; + }}}}}}} + } - const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity(); + ViewFill( const OutputView & arg_out , const_value_type & arg_in ) + : output( arg_out ), input( arg_in ) + { + typedef typename OutputView::execution_space execution_space ; + typedef Kokkos::RangePolicy< execution_space > Policy ; - Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes ); - } - else { - // Destination view's execution space must be able to directly access source memory space - // in order for the ViewRemap functor run in the destination memory space's execution space. 
- size_t stride[8]; - src.stride(stride); - size_t size_stride = stride[0]*src.dimension_0(); - size_t size_dim = src.dimension_0(); - for(int i = 1; i<src.rank; i++) { - if(stride[i]*src.dimension(i)>size_stride) - size_stride = stride[i]*src.dimension(i); - size_dim*=src.dimension(i); - } + const Kokkos::Impl::ParallelFor< ViewFill , Policy > closure( *this , Policy( 0 , output.dimension_0() ) ); - if( shapes_are_equal && size_stride == size_dim) { - const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity(); + closure.execute(); - Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes ); - } else { - Impl::ViewRemap< dst_type , src_type >( dst , src ); - } + execution_space::fence(); } - } -} - +}; -/** \brief Deep copy equal dimension arrays in the same space which - * have different layouts or specializations. - */ -template< class DT , class DL , class DD , class DM , class DS , - class ST , class SL , class SD , class SM , class SS > -inline -void deep_copy( const View< DT, DL, DD, DM, DS > & dst , - const View< ST, SL, SD, SM, SS > & src , - const typename Impl::enable_if<( - // Same type and destination is not constant: - Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type , - typename View<DT,DL,DD,DM,DS>::non_const_value_type >::value - && - // Source memory space is accessible to destination memory space - Impl::VerifyExecutionCanAccessMemorySpace< typename View<DT,DL,DD,DM,DS>::memory_space - , typename View<ST,SL,SD,SM,SS>::memory_space >::value - && - // Same non-zero rank - ( unsigned( View<DT,DL,DD,DM,DS>::rank ) == - unsigned( View<ST,SL,SD,SM,SS>::rank ) ) - && - ( 0 < unsigned( View<DT,DL,DD,DM,DS>::rank ) ) - && - // Different layout or different specialization: - ( ( ! Impl::is_same< typename View<DT,DL,DD,DM,DS>::array_layout , - typename View<ST,SL,SD,SM,SS>::array_layout >::value ) - || - ( ! 
Impl::is_same< DS , SS >::value ) - ) - )>::type * = 0 ) -{ - typedef View< DT, DL, DD, DM, DS > dst_type ; - typedef View< ST, SL, SD, SM, SS > src_type ; +template< class OutputView > +struct ViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > { + ViewFill( const OutputView & dst , const typename OutputView::const_value_type & src ) + { + Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace > + ( dst.data() , & src , sizeof(typename OutputView::const_value_type) ); + } +}; - assert_shapes_equal_dimension( dst.shape() , src.shape() ); +template< class OutputView , class InputView , class ExecSpace = typename OutputView::execution_space > +struct ViewRemap { + + const OutputView output ; + const InputView input ; + const size_t n0 ; + const size_t n1 ; + const size_t n2 ; + const size_t n3 ; + const size_t n4 ; + const size_t n5 ; + const size_t n6 ; + const size_t n7 ; + + ViewRemap( const OutputView & arg_out , const InputView & arg_in ) + : output( arg_out ), input( arg_in ) + , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) ) + , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) ) + , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) ) + , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) ) + , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) ) + , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) ) + , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) ) + , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) ) + { + typedef Kokkos::RangePolicy< ExecSpace > Policy ; + const Kokkos::Impl::ParallelFor< ViewRemap , Policy > closure( *this , Policy( 0 , n0 ) ); + closure.execute(); + } - Impl::ViewRemap< dst_type , src_type >( dst , src ); -} + KOKKOS_INLINE_FUNCTION + void operator()( const size_t 
i0 ) const + { + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) { + for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) { + for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) { + for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) { + for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) { + output(i0,i1,i2,i3,i4,i5,i6,i7) = input(i0,i1,i2,i3,i4,i5,i6,i7); + }}}}}}} + } +}; -} +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { +namespace Experimental { -//---------------------------------------------------------------------------- -/** \brief Deep copy a value into a view. - */ -template< class ExecSpace, class DT , class DL , class DD , class DM , class DS > +/** \brief Deep copy a value from Host memory into a view. */ +template< class DT , class ... DP > inline -void deep_copy( const ExecSpace&, const View<DT,DL,DD,DM,DS> & dst , - typename Impl::enable_if<( - Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::non_const_value_type , - typename ViewTraits<DT,DL,DD,DM>::value_type >::value - ), typename ViewTraits<DT,DL,DD,DM>::const_value_type >::type & value ) +void deep_copy + ( const View<DT,DP...> & dst + , typename ViewTraits<DT,DP...>::const_value_type & value + , typename std::enable_if< + std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value + >::type * = 0 ) { - Impl::ViewFill< View<DT,DL,DD,DM,DS> >( dst , value ); + static_assert( + std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type , + typename ViewTraits<DT,DP...>::value_type >::value + , "deep_copy requires non-const type" ); + + Kokkos::Experimental::Impl::ViewFill< View<DT,DP...> >( dst , value ); } -template< class ExecSpace, class ST , class SL , class SD , class SM , class SS > +/** \brief Deep copy into a value in Host memory from a 
view. */ +template< class ST , class ... SP > inline -typename Impl::enable_if<( ViewTraits<ST,SL,SD,SM>::rank == 0 )>::type -deep_copy( const ExecSpace& exec, ST & dst , const View<ST,SL,SD,SM,SS> & src ) +void deep_copy + ( typename ViewTraits<ST,SP...>::non_const_value_type & dst + , const View<ST,SP...> & src + , typename std::enable_if< + std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value + >::type * = 0 ) { - typedef ViewTraits<ST,SL,SD,SM> src_traits ; + static_assert( ViewTraits<ST,SP...>::rank == 0 + , "ERROR: Non-rank-zero view in deep_copy( value , View )" ); + + typedef ViewTraits<ST,SP...> src_traits ; typedef typename src_traits::memory_space src_memory_space ; - Impl::DeepCopy< HostSpace , src_memory_space , ExecSpace >( exec , & dst , src.ptr_on_device() , sizeof(ST) ); + Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) ); } //---------------------------------------------------------------------------- -/** \brief A deep copy between views of compatible type, and rank zero. - */ -template< class ExecSpace , - class DT , class DL , class DD , class DM , class DS , - class ST , class SL , class SD , class SM , class SS > +/** \brief A deep copy between views of compatible type, and rank zero. */ +template< class DT , class ... DP , class ST , class ... 
SP > inline -void deep_copy( const ExecSpace& exec, - const View<DT,DL,DD,DM,DS> & dst , - const View<ST,SL,SD,SM,SS> & src , - typename Impl::enable_if<( - // Same type and destination is not constant: - Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type , - typename View<ST,SL,SD,SM,SS>::non_const_value_type >::value - && - // Rank zero: - ( unsigned(View<DT,DL,DD,DM,DS>::rank) == unsigned(0) ) && - ( unsigned(View<ST,SL,SD,SM,SS>::rank) == unsigned(0) ) - )>::type * = 0 ) +void deep_copy + ( const View<DT,DP...> & dst + , const View<ST,SP...> & src + , typename std::enable_if<( + std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value && + std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value && + ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) && + unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) ) + )>::type * = 0 ) { - typedef View<DT,DL,DD,DM,DS> dst_type ; - typedef View<ST,SL,SD,SM,SS> src_type ; + static_assert( + std::is_same< typename ViewTraits<DT,DP...>::value_type , + typename ViewTraits<ST,SP...>::non_const_value_type >::value + , "deep_copy requires matching non-const destination type" ); + + typedef View<DT,DP...> dst_type ; + typedef View<ST,SP...> src_type ; + typedef typename dst_type::value_type value_type ; typedef typename dst_type::memory_space dst_memory_space ; typedef typename src_type::memory_space src_memory_space ; - typedef typename src_type::value_type value_type ; - if ( dst.ptr_on_device() != src.ptr_on_device() ) { - Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >( exec , dst.ptr_on_device() , src.ptr_on_device() , sizeof(value_type) ); + if ( dst.data() != src.data() ) { + Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) ); } } @@ -1632,172 +1799,395 @@ void deep_copy( const ExecSpace& exec, /** \brief A deep copy between views of the default specialization, compatible type, * same non-zero rank, same 
contiguous layout. */ -template< class ExecSpace , - class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > +template< class DT , class ... DP , class ST , class ... SP > inline -void deep_copy( const ExecSpace & exec, - const View<DT,DL,DD,DM,Impl::ViewDefault> & dst , - const View<ST,SL,SD,SM,Impl::ViewDefault> & src , - typename Impl::enable_if<( - // Same type and destination is not constant: - Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::value_type , - typename View<ST,SL,SD,SM,Impl::ViewDefault>::non_const_value_type >::value - && - // Same non-zero rank: - ( unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) == - unsigned(View<ST,SL,SD,SM,Impl::ViewDefault>::rank) ) - && - ( 0 < unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) ) - && - // Same layout: - Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , - typename View<ST,SL,SD,SM,Impl::ViewDefault>::array_layout >::value - )>::type * = 0 ) +void deep_copy + ( const View<DT,DP...> & dst + , const View<ST,SP...> & src + , typename std::enable_if<( + std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value && + std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value && + ( unsigned(ViewTraits<DT,DP...>::rank) != 0 || + unsigned(ViewTraits<ST,SP...>::rank) != 0 ) + )>::type * = 0 ) { - typedef View<DT,DL,DD,DM,Impl::ViewDefault> dst_type ; - typedef View<ST,SL,SD,SM,Impl::ViewDefault> src_type ; + static_assert( + std::is_same< typename ViewTraits<DT,DP...>::value_type , + typename ViewTraits<DT,DP...>::non_const_value_type >::value + , "deep_copy requires non-const destination type" ); + + static_assert( + ( unsigned(ViewTraits<DT,DP...>::rank) == + unsigned(ViewTraits<ST,SP...>::rank) ) + , "deep_copy requires Views of equal rank" ); + + typedef View<DT,DP...> dst_type ; + typedef View<ST,SP...> src_type ; + + typedef typename dst_type::execution_space dst_execution_space ; + typedef typename 
src_type::execution_space src_execution_space ; + typedef typename dst_type::memory_space dst_memory_space ; + typedef typename src_type::memory_space src_memory_space ; + + enum { DstExecCanAccessSrc = + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value }; + + enum { SrcExecCanAccessDst = + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value }; + + + if ( (void *) dst.data() != (void*) src.data() ) { + + // Concern: If overlapping views then a parallel copy will be erroneous. + // ... + + // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy + + if ( std::is_same< typename ViewTraits<DT,DP...>::value_type , + typename ViewTraits<ST,SP...>::non_const_value_type >::value && + ( + ( std::is_same< typename ViewTraits<DT,DP...>::array_layout , + typename ViewTraits<ST,SP...>::array_layout >::value + && + ( std::is_same< typename ViewTraits<DT,DP...>::array_layout , + typename Kokkos::LayoutLeft>::value + || + std::is_same< typename ViewTraits<DT,DP...>::array_layout , + typename Kokkos::LayoutRight>::value + ) + ) + || + ( ViewTraits<DT,DP...>::rank == 1 && + ViewTraits<ST,SP...>::rank == 1 ) + ) && + dst.span_is_contiguous() && + src.span_is_contiguous() && + dst.span() == src.span() && + dst.dimension_0() == src.dimension_0() && + dst.dimension_1() == src.dimension_1() && + dst.dimension_2() == src.dimension_2() && + dst.dimension_3() == src.dimension_3() && + dst.dimension_4() == src.dimension_4() && + dst.dimension_5() == src.dimension_5() && + dst.dimension_6() == src.dimension_6() && + dst.dimension_7() == src.dimension_7() ) { + + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + + Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes ); + } + else if ( std::is_same< typename 
ViewTraits<DT,DP...>::value_type , + typename ViewTraits<ST,SP...>::non_const_value_type >::value && + ( + ( std::is_same< typename ViewTraits<DT,DP...>::array_layout , + typename ViewTraits<ST,SP...>::array_layout >::value + && + std::is_same< typename ViewTraits<DT,DP...>::array_layout , + typename Kokkos::LayoutStride>::value + ) + || + ( ViewTraits<DT,DP...>::rank == 1 && + ViewTraits<ST,SP...>::rank == 1 ) + ) && + dst.span_is_contiguous() && + src.span_is_contiguous() && + dst.span() == src.span() && + dst.dimension_0() == src.dimension_0() && + dst.dimension_1() == src.dimension_1() && + dst.dimension_2() == src.dimension_2() && + dst.dimension_3() == src.dimension_3() && + dst.dimension_4() == src.dimension_4() && + dst.dimension_5() == src.dimension_5() && + dst.dimension_6() == src.dimension_6() && + dst.dimension_7() == src.dimension_7() && + dst.stride_0() == src.stride_0() && + dst.stride_1() == src.stride_1() && + dst.stride_2() == src.stride_2() && + dst.stride_3() == src.stride_3() && + dst.stride_4() == src.stride_4() && + dst.stride_5() == src.stride_5() && + dst.stride_6() == src.stride_6() && + dst.stride_7() == src.stride_7() + ) { + + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + + Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes ); + } + else if ( DstExecCanAccessSrc ) { + // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. + Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src ); + } + else if ( SrcExecCanAccessDst ) { + // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. 
+ Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type , src_execution_space >( dst , src ); + } + else { + Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); + } + } +} - typedef typename dst_type::memory_space dst_memory_space ; - typedef typename src_type::memory_space src_memory_space ; +} /* namespace Experimental */ +} /* namespace Kokkos */ - enum { is_contiguous = // Contiguous (e.g., non-strided, non-tiled) layout - Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutLeft >::value || - Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutRight >::value }; +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- - if ( dst.ptr_on_device() != src.ptr_on_device() ) { +namespace Kokkos { +namespace Experimental { - // Same shape (dimensions) +/** \brief Deep copy a value from Host memory into a view. */ +template< class ExecSpace ,class DT , class ... DP > +inline +void deep_copy + ( const ExecSpace & + , const View<DT,DP...> & dst + , typename ViewTraits<DT,DP...>::const_value_type & value + , typename std::enable_if< + Kokkos::Impl::is_execution_space< ExecSpace >::value && + std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value + >::type * = 0 ) +{ + static_assert( + std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type , + typename ViewTraits<DT,DP...>::value_type >::value + , "deep_copy requires non-const type" ); - const bool shapes_are_equal = dst.shape() == src.shape(); + Kokkos::Experimental::Impl::ViewFill< View<DT,DP...> >( dst , value ); +} - if ( shapes_are_equal && is_contiguous && dst.capacity() == src.capacity() ) { +/** \brief Deep copy into a value in Host memory from a view. */ +template< class ExecSpace , class ST , class ... 
SP > +inline +void deep_copy + ( const ExecSpace & exec_space + , typename ViewTraits<ST,SP...>::non_const_value_type & dst + , const View<ST,SP...> & src + , typename std::enable_if< + Kokkos::Impl::is_execution_space< ExecSpace >::value && + std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value + >::type * = 0 ) +{ + static_assert( ViewTraits<ST,SP...>::rank == 0 + , "ERROR: Non-rank-zero view in deep_copy( value , View )" ); - // Views span equal length contiguous range. - // Assuming can perform a straight memory copy over this range. + typedef ViewTraits<ST,SP...> src_traits ; + typedef typename src_traits::memory_space src_memory_space ; + Kokkos::Impl::DeepCopy< HostSpace , src_memory_space , ExecSpace > + ( exec_space , & dst , src.data() , sizeof(ST) ); +} - const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity(); +//---------------------------------------------------------------------------- +/** \brief A deep copy between views of compatible type, and rank zero. */ +template< class ExecSpace , class DT , class ... DP , class ST , class ... 
SP > +inline +void deep_copy + ( const ExecSpace & exec_space + , const View<DT,DP...> & dst + , const View<ST,SP...> & src + , typename std::enable_if<( + Kokkos::Impl::is_execution_space< ExecSpace >::value && + std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value && + std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value && + ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) && + unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) ) + )>::type * = 0 ) +{ + static_assert( + std::is_same< typename ViewTraits<DT,DP...>::value_type , + typename ViewTraits<ST,SP...>::non_const_value_type >::value + , "deep_copy requires matching non-const destination type" ); - Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >( exec , dst.ptr_on_device() , src.ptr_on_device() , nbytes ); - } - else { - // Destination view's execution space must be able to directly access source memory space - // in order for the ViewRemap functor run in the destination memory space's execution space. 
- size_t stride[8]; - src.stride(stride); - size_t size_stride = stride[0]*src.dimension_0(); - size_t size_dim = src.dimension_0(); - for(int i = 1; i<src.rank; i++) { - if(stride[i]*src.dimension(i)>size_stride) - size_stride = stride[i]*src.dimension(i); - size_dim*=src.dimension(i); - } + typedef View<DT,DP...> dst_type ; + typedef View<ST,SP...> src_type ; - if( shapes_are_equal && size_stride == size_dim) { - const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity(); + typedef typename dst_type::value_type value_type ; + typedef typename dst_type::memory_space dst_memory_space ; + typedef typename src_type::memory_space src_memory_space ; - Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >( exec , dst.ptr_on_device() , src.ptr_on_device() , nbytes ); - } else { - Impl::ViewRemap< dst_type , src_type >( dst , src ); - } - } + if ( dst.data() != src.data() ) { + Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace > + ( exec_space , dst.data() , src.data() , sizeof(value_type) ); } } - -/** \brief Deep copy equal dimension arrays in the same space which - * have different layouts or specializations. +//---------------------------------------------------------------------------- +/** \brief A deep copy between views of the default specialization, compatible type, + * same non-zero rank, same contiguous layout. */ -template< class ExecSpace , - class DT , class DL , class DD , class DM , class DS , - class ST , class SL , class SD , class SM , class SS > +template< class ExecSpace , class DT, class ... DP, class ST, class ... 
SP > inline -void deep_copy( const ExecSpace& , - const View< DT, DL, DD, DM, DS > & dst , - const View< ST, SL, SD, SM, SS > & src , - const typename Impl::enable_if<( - // Same type and destination is not constant: - Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type , - typename View<DT,DL,DD,DM,DS>::non_const_value_type >::value - && - // Source memory space is accessible to destination memory space - Impl::VerifyExecutionCanAccessMemorySpace< typename View<DT,DL,DD,DM,DS>::memory_space - , typename View<ST,SL,SD,SM,SS>::memory_space >::value - && - // Same non-zero rank - ( unsigned( View<DT,DL,DD,DM,DS>::rank ) == - unsigned( View<ST,SL,SD,SM,SS>::rank ) ) - && - ( 0 < unsigned( View<DT,DL,DD,DM,DS>::rank ) ) - && - // Different layout or different specialization: - ( ( ! Impl::is_same< typename View<DT,DL,DD,DM,DS>::array_layout , - typename View<ST,SL,SD,SM,SS>::array_layout >::value ) - || - ( ! Impl::is_same< DS , SS >::value ) - ) - )>::type * = 0 ) +void deep_copy + ( const ExecSpace & exec_space + , const View<DT,DP...> & dst + , const View<ST,SP...> & src + , typename std::enable_if<( + Kokkos::Impl::is_execution_space< ExecSpace >::value && + std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value && + std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value && + ( unsigned(ViewTraits<DT,DP...>::rank) != 0 || + unsigned(ViewTraits<ST,SP...>::rank) != 0 ) + )>::type * = 0 ) { - typedef View< DT, DL, DD, DM, DS > dst_type ; - typedef View< ST, SL, SD, SM, SS > src_type ; - - assert_shapes_equal_dimension( dst.shape() , src.shape() ); - - Impl::ViewRemap< dst_type , src_type >( dst , src ); + static_assert( + std::is_same< typename ViewTraits<DT,DP...>::value_type , + typename ViewTraits<DT,DP...>::non_const_value_type >::value + , "deep_copy requires non-const destination type" ); + + static_assert( + ( unsigned(ViewTraits<DT,DP...>::rank) == + unsigned(ViewTraits<ST,SP...>::rank) ) + , "deep_copy requires Views of 
equal rank" ); + + typedef View<DT,DP...> dst_type ; + typedef View<ST,SP...> src_type ; + + typedef typename dst_type::execution_space dst_execution_space ; + typedef typename src_type::execution_space src_execution_space ; + typedef typename dst_type::memory_space dst_memory_space ; + typedef typename src_type::memory_space src_memory_space ; + + enum { DstExecCanAccessSrc = + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value }; + + enum { SrcExecCanAccessDst = + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value }; + + if ( (void *) dst.data() != (void*) src.data() ) { + + // Concern: If overlapping views then a parallel copy will be erroneous. + // ... + + // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy + + if ( std::is_same< typename ViewTraits<DT,DP...>::value_type , + typename ViewTraits<ST,SP...>::non_const_value_type >::value && + ( + std::is_same< typename ViewTraits<DT,DP...>::array_layout , + typename ViewTraits<ST,SP...>::array_layout >::value + || + ( ViewTraits<DT,DP...>::rank == 1 && + ViewTraits<ST,SP...>::rank == 1 ) + ) && + dst.span_is_contiguous() && + src.span_is_contiguous() && + dst.span() == src.span() && + dst.dimension_0() == src.dimension_0() && + dst.dimension_1() == src.dimension_1() && + dst.dimension_2() == src.dimension_2() && + dst.dimension_3() == src.dimension_3() && + dst.dimension_4() == src.dimension_4() && + dst.dimension_5() == src.dimension_5() && + dst.dimension_6() == src.dimension_6() && + dst.dimension_7() == src.dimension_7() ) { + + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + + Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace > + ( exec_space , dst.data() , src.data() , nbytes ); + } + else if ( DstExecCanAccessSrc ) { + // Copying data between views in accessible 
memory spaces and either non-contiguous or incompatible shape. + Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src ); + } + else if ( SrcExecCanAccessDst ) { + // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. + Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type , src_execution_space >( dst , src ); + } + else { + Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); + } + } } -} +} /* namespace Experimental */ +} /* namespace Kokkos */ + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { +namespace Experimental { +namespace Impl { + +// Deduce Mirror Types +template<class Space, class T, class ... P> +struct MirrorViewType { + // The incoming view_type + typedef typename Kokkos::Experimental::View<T,P...> src_view_type; + // The memory space for the mirror view + typedef typename Space::memory_space memory_space; + // Check whether it is the same memory space + enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value }; + // The array_layout + typedef typename src_view_type::array_layout array_layout; + // The data type (we probably want it non-const since otherwise we can't even deep_copy to it. + typedef typename src_view_type::non_const_data_type data_type; + // The destination view type if it is not the same memory space + typedef Kokkos::Experimental::View<data_type,array_layout,Space> dest_view_type; + // If it is the same memory_space return the existsing view_type + // This will also keep the unmanaged trait if necessary + typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type; +}; + +template<class Space, class T, class ... 
P> +struct MirrorType { + // The incoming view_type + typedef typename Kokkos::Experimental::View<T,P...> src_view_type; + // The memory space for the mirror view + typedef typename Space::memory_space memory_space; + // Check whether it is the same memory space + enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value }; + // The array_layout + typedef typename src_view_type::array_layout array_layout; + // The data type (we probably want it non-const since otherwise we can't even deep_copy to it. + typedef typename src_view_type::non_const_data_type data_type; + // The destination view type if it is not the same memory space + typedef Kokkos::Experimental::View<data_type,array_layout,Space> view_type; +}; -template< class T , class L , class D , class M , class S > -typename Impl::enable_if<( - View<T,L,D,M,S>::is_managed && - !Impl::is_same<L,LayoutStride>::value - ), typename View<T,L,D,M,S>::HostMirror >::type +} + +template< class T , class ... P > inline -create_mirror( const View<T,L,D,M,S> & src ) +typename Kokkos::Experimental::View<T,P...>::HostMirror +create_mirror( const Kokkos::Experimental::View<T,P...> & src + , typename std::enable_if< + ! std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout + , Kokkos::LayoutStride >::value + >::type * = 0 + ) { - typedef View<T,L,D,M,S> view_type ; - typedef typename view_type::HostMirror host_view_type ; - - // 'view' is managed therefore we can allocate a - // compatible host_view through the ordinary constructor. 
- - std::string label = src.tracker().label(); - label.append("_mirror"); - - return host_view_type( label , - src.dimension_0() , - src.dimension_1() , - src.dimension_2() , - src.dimension_3() , - src.dimension_4() , - src.dimension_5() , - src.dimension_6() , - src.dimension_7() ); + typedef View<T,P...> src_type ; + typedef typename src_type::HostMirror dst_type ; + + return dst_type( std::string( src.label() ).append("_mirror") + , src.dimension_0() + , src.dimension_1() + , src.dimension_2() + , src.dimension_3() + , src.dimension_4() + , src.dimension_5() + , src.dimension_6() + , src.dimension_7() ); } -template< class T , class L , class D , class M , class S > -typename Impl::enable_if<( - View<T,L,D,M,S>::is_managed && - Impl::is_same<L,LayoutStride>::value - ), typename View<T,L,D,M,S>::HostMirror >::type +template< class T , class ... P > inline -create_mirror( const View<T,L,D,M,S> & src ) +typename Kokkos::Experimental::View<T,P...>::HostMirror +create_mirror( const Kokkos::Experimental::View<T,P...> & src + , typename std::enable_if< + std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout + , Kokkos::LayoutStride >::value + >::type * = 0 + ) { - typedef View<T,L,D,M,S> view_type ; - typedef typename view_type::HostMirror host_view_type ; + typedef View<T,P...> src_type ; + typedef typename src_type::HostMirror dst_type ; - // 'view' is managed therefore we can allocate a - // compatible host_view through the ordinary constructor. 
+ Kokkos::LayoutStride layout ; - std::string label = src.tracker().label(); - label.append("_mirror"); - LayoutStride layout; - src.stride(layout.stride); layout.dimension[0] = src.dimension_0(); layout.dimension[1] = src.dimension_1(); layout.dimension[2] = src.dimension_2(); @@ -1807,37 +2197,91 @@ create_mirror( const View<T,L,D,M,S> & src ) layout.dimension[6] = src.dimension_6(); layout.dimension[7] = src.dimension_7(); - return host_view_type( label , layout ); + layout.stride[0] = src.stride_0(); + layout.stride[1] = src.stride_1(); + layout.stride[2] = src.stride_2(); + layout.stride[3] = src.stride_3(); + layout.stride[4] = src.stride_4(); + layout.stride[5] = src.stride_5(); + layout.stride[6] = src.stride_6(); + layout.stride[7] = src.stride_7(); + + return dst_type( std::string( src.label() ).append("_mirror") , layout ); +} + + +// Create a mirror in a new space (specialization for different space) +template<class Space, class T, class ... P> +typename Impl::MirrorType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::Experimental::View<T,P...> & src) { + return typename Impl::MirrorType<Space,T,P ...>::view_type(src.label(),src.layout()); } -template< class T , class L , class D , class M , class S > -typename Impl::enable_if<( - View<T,L,D,M,S>::is_managed && - Impl::ViewAssignable< typename View<T,L,D,M,S>::HostMirror , View<T,L,D,M,S> >::value - ), typename View<T,L,D,M,S>::HostMirror >::type + +template< class T , class ... 
P > inline -create_mirror_view( const View<T,L,D,M,S> & src ) +typename Kokkos::Experimental::View<T,P...>::HostMirror +create_mirror_view( const Kokkos::Experimental::View<T,P...> & src + , typename std::enable_if<( + std::is_same< typename Kokkos::Experimental::View<T,P...>::memory_space + , typename Kokkos::Experimental::View<T,P...>::HostMirror::memory_space + >::value + && + std::is_same< typename Kokkos::Experimental::View<T,P...>::data_type + , typename Kokkos::Experimental::View<T,P...>::HostMirror::data_type + >::value + )>::type * = 0 + ) { return src ; } -template< class T , class L , class D , class M , class S > -typename Impl::enable_if<( - View<T,L,D,M,S>::is_managed && - ! Impl::ViewAssignable< typename View<T,L,D,M,S>::HostMirror , View<T,L,D,M,S> >::value - ), typename View<T,L,D,M,S>::HostMirror >::type +template< class T , class ... P > inline -create_mirror_view( const View<T,L,D,M,S> & src ) +typename Kokkos::Experimental::View<T,P...>::HostMirror +create_mirror_view( const Kokkos::Experimental::View<T,P...> & src + , typename std::enable_if< ! ( + std::is_same< typename Kokkos::Experimental::View<T,P...>::memory_space + , typename Kokkos::Experimental::View<T,P...>::HostMirror::memory_space + >::value + && + std::is_same< typename Kokkos::Experimental::View<T,P...>::data_type + , typename Kokkos::Experimental::View<T,P...>::HostMirror::data_type + >::value + )>::type * = 0 + ) { - return create_mirror( src ); + return Kokkos::Experimental::create_mirror( src ); +} + +// Create a mirror view in a new space (specialization for same space) +template<class Space, class T, class ... 
P> +typename Impl::MirrorViewType<Space,T,P ...>::view_type +create_mirror_view(const Space& , const Kokkos::Experimental::View<T,P...> & src + , typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) { + return src; +} + +// Create a mirror view in a new space (specialization for different space) +template<class Space, class T, class ... P> +typename Impl::MirrorViewType<Space,T,P ...>::view_type +create_mirror_view(const Space& , const Kokkos::Experimental::View<T,P...> & src + , typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) { + return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout()); } +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +namespace Kokkos { +namespace Experimental { + /** \brief Resize a view with copying old data to new data at the corresponding indices. */ -template< class T , class L , class D , class M , class S > +template< class T , class ... 
P > inline -void resize( View<T,L,D,M,S> & v , - const typename Impl::enable_if< ViewTraits<T,L,D,M>::is_managed , size_t >::type n0 , +void resize( Kokkos::Experimental::View<T,P...> & v , + const size_t n0 = 0 , const size_t n1 = 0 , const size_t n2 = 0 , const size_t n3 = 0 , @@ -1846,24 +2290,22 @@ void resize( View<T,L,D,M,S> & v , const size_t n6 = 0 , const size_t n7 = 0 ) { - typedef View<T,L,D,M,S> view_type ; + typedef Kokkos::Experimental::View<T,P...> view_type ; - const std::string label = v.tracker().label(); + static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only resize managed views" ); - view_type v_resized( label, n0, n1, n2, n3, n4, n5, n6, n7 ); + view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 ); - Impl::ViewRemap< view_type , view_type >( v_resized , v ); - - view_type::execution_space::fence(); + Kokkos::Experimental::Impl::ViewRemap< view_type , view_type >( v_resized , v ); v = v_resized ; } -/** \brief Reallocate a view without copying old data to new data */ -template< class T , class L , class D , class M , class S > +/** \brief Resize a view with copying old data to new data at the corresponding indices. */ +template< class T , class ... P > inline -void realloc( View<T,L,D,M,S> & v , - const typename Impl::enable_if< ViewTraits<T,L,D,M>::is_managed , size_t >::type n0 , +void realloc( Kokkos::Experimental::View<T,P...> & v , + const size_t n0 = 0 , const size_t n1 = 0 , const size_t n2 = 0 , const size_t n3 = 0 , @@ -1872,239 +2314,71 @@ void realloc( View<T,L,D,M,S> & v , const size_t n6 = 0 , const size_t n7 = 0 ) { - typedef View<T,L,D,M,S> view_type ; + typedef Kokkos::Experimental::View<T,P...> view_type ; + + static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" ); - // Query the current label and reuse it. 
- const std::string label = v.tracker().label(); + const std::string label = v.label(); - v = view_type(); // deallocate first, if the only view to memory. + v = view_type(); // Deallocate first, if the only view to allocation v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 ); } -} // namespace Kokkos +} /* namespace Experimental */ +} /* namespace Kokkos */ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -template< class D , class A1 , class A2 , class A3 , class S , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 > -KOKKOS_INLINE_FUNCTION -typename Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , ArgType6 , ArgType7 - >::type -subview( const View<D,A1,A2,A3,S> & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 , - const ArgType5 & arg5 , - const ArgType6 & arg6 , - const ArgType7 & arg7 ) -{ - typedef typename - Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , ArgType6 , ArgType7 - >::type - DstViewType ; +template< class D , class ... P > +using ViewTraits = Kokkos::Experimental::ViewTraits<D,P...> ; - return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 ); -} +using Experimental::View ; //modified due to gcc parser bug +//template< class D , class ... 
P > +//using View = Kokkos::Experimental::View<D,P...> ; -template< class D , class A1 , class A2 , class A3 , class S , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 , class ArgType5 , class ArgType6 > -KOKKOS_INLINE_FUNCTION -typename Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , ArgType6 , void - >::type -subview( const View<D,A1,A2,A3,S> & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 , - const ArgType5 & arg5 , - const ArgType6 & arg6 ) -{ - typedef typename - Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , ArgType6 , void - >::type - DstViewType ; - - return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5, arg6 ); -} +using Kokkos::Experimental::ALL ; +using Kokkos::Experimental::WithoutInitializing ; +using Kokkos::Experimental::AllowPadding ; +using Kokkos::Experimental::view_alloc ; +using Kokkos::Experimental::view_wrap ; -template< class D , class A1 , class A2 , class A3 , class S , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 , class ArgType5 > -KOKKOS_INLINE_FUNCTION -typename Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , void , void - >::type -subview( const View<D,A1,A2,A3,S> & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 , - const ArgType5 & arg5 ) -{ - typedef typename - Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , void , void - >::type - DstViewType ; - - return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5 ); -} - -template< class D , class A1 , class A2 , class A3 , class S , - class ArgType0 , class ArgType1 , class ArgType2 , 
class ArgType3 , - class ArgType4 > -KOKKOS_INLINE_FUNCTION -typename Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , void , void , void - >::type -subview( const View<D,A1,A2,A3,S> & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 ) -{ - typedef typename - Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , void , void , void - >::type - DstViewType ; - - return DstViewType( src, arg0, arg1, arg2, arg3, arg4 ); -} - -template< class D , class A1 , class A2 , class A3 , class S , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 > -KOKKOS_INLINE_FUNCTION -typename Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , void , void , void , void - >::type -subview( const View<D,A1,A2,A3,S> & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 ) -{ - typedef typename - Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , void , void , void , void - >::type - DstViewType ; - - return DstViewType( src, arg0, arg1, arg2, arg3 ); -} - -template< class D , class A1 , class A2 , class A3 , class S , - class ArgType0 , class ArgType1 , class ArgType2 > -KOKKOS_INLINE_FUNCTION -typename Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , void - , void , void , void , void - >::type -subview( const View<D,A1,A2,A3,S> & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 ) -{ - typedef typename - Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , ArgType2 , void - , void , void , void , void - >::type - DstViewType ; +using Kokkos::Experimental::deep_copy ; +using Kokkos::Experimental::create_mirror ; +using Kokkos::Experimental::create_mirror_view ; +using Kokkos::Experimental::subview ; 
+using Kokkos::Experimental::resize ; +using Kokkos::Experimental::realloc ; +using Kokkos::Experimental::is_view ; - return DstViewType( src, arg0, arg1, arg2 ); -} +namespace Impl { -template< class D , class A1 , class A2 , class A3 , class S , - class ArgType0 , class ArgType1 > -KOKKOS_INLINE_FUNCTION -typename Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , void , void - , void , void , void , void - >::type -subview( const View<D,A1,A2,A3,S> & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 ) -{ - typedef typename - Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , ArgType1 , void , void - , void , void , void , void - >::type - DstViewType ; +using Kokkos::Experimental::is_view ; - return DstViewType( src, arg0, arg1 ); -} +class ViewDefault {}; -template< class D , class A1 , class A2 , class A3 , class S , - class ArgType0 > -KOKKOS_INLINE_FUNCTION -typename Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , void , void , void - , void , void , void , void - >::type -subview( const View<D,A1,A2,A3,S> & src , - const ArgType0 & arg0 ) -{ - typedef typename - Impl::ViewSubview< View<D,A1,A2,A3,S> - , ArgType0 , void , void , void - , void , void , void , void - >::type - DstViewType ; +template< class SrcViewType + , class Arg0Type + , class Arg1Type + , class Arg2Type + , class Arg3Type + , class Arg4Type + , class Arg5Type + , class Arg6Type + , class Arg7Type + > +struct ViewSubview /* { typedef ... 
type ; } */ ; - return DstViewType( src, arg0 ); } -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +} /* namespace Kokkos */ -#include <impl/Kokkos_ViewDefault.hpp> #include <impl/Kokkos_Atomic_View.hpp> -#include <impl/Kokkos_ViewOffset.hpp> -#include <impl/Kokkos_ViewSupport.hpp> - -namespace Kokkos { -/** \brief Tag denoting that a subview should capture all of a dimension */ -struct ALL { KOKKOS_INLINE_FUNCTION ALL(){} }; -} - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - -#include <KokkosExp_View.hpp> - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif +#endif /* #ifndef KOKKOS_VIEW_HPP */ diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index c58706bbaa..27ae5803ce 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -178,9 +178,10 @@ public: namespace Kokkos { namespace Impl { -template< class FunctorType , class ... Traits > +template< class FunctorType , class ReducerType, class ... 
Traits > class ParallelReduce< FunctorType , Kokkos::RangePolicy< Traits ...> + , ReducerType , Kokkos::OpenMP > { @@ -192,15 +193,21 @@ private: typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::member_type Member ; - typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ; + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + // Static Assert WorkTag void if ReducerType not InvalidType + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; const FunctorType m_functor ; const Policy m_policy ; + const ReducerType m_reducer ; const pointer_type m_result_ptr ; template< class TagType > @@ -252,7 +259,7 @@ public: OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce"); - OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 ); + OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 ); #pragma omp parallel { @@ -260,7 +267,7 @@ public: const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() ); ParallelReduce::template exec_range< WorkTag > ( m_functor , range.begin() , range.end() - , ValueInit::init( m_functor , exec.scratch_reduce() ) ); + , ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) ); } /* END #pragma omp parallel */ @@ -269,13 
+276,13 @@ public: const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() ); for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) { - ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() ); + ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() ); } - Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); if ( m_result_ptr ) { - const int n = ValueTraits::value_count( m_functor ); + const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; } } @@ -289,7 +296,7 @@ public: OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce"); - OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 ); + OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 ); #pragma omp parallel { @@ -302,7 +309,7 @@ public: long work_index = exec.get_work_index(); - reference_type update = ValueInit::init( m_functor , exec.scratch_reduce() ); + reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ); while(work_index != -1) { const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size(); const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end(); @@ -319,13 +326,13 @@ public: const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() ); for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) { - ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() ); + ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , 
OpenMPexec::pool_rev(i)->scratch_reduce() ); } - Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); if ( m_result_ptr ) { - const int n = ValueTraits::value_count( m_functor ); + const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; } } @@ -337,18 +344,35 @@ public: inline ParallelReduce( const FunctorType & arg_functor , Policy arg_policy - , const ViewType & arg_result_view ) + , const ViewType & arg_result_view + , typename std::enable_if< + Kokkos::is_view< ViewType >::value && + !Kokkos::is_reducer_type<ReducerType>::value + ,void*>::type = NULL) : m_functor( arg_functor ) , m_policy( arg_policy ) - , m_result_ptr( arg_result_view.ptr_on_device() ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_result_view.data() ) { - static_assert( Kokkos::is_view< ViewType >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View" ); + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } - static_assert( std::is_same< typename ViewType::memory_space + inline + ParallelReduce( const FunctorType & arg_functor + , Policy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + , m_reducer( reducer ) + , m_result_ptr( reducer.result_view().data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space , Kokkos::HostSpace >::value - , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" ); + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ } + }; } // namespace Impl @@ -568,13 +592,13 @@ public: const size_t team_reduce_size = Policy::member_type::team_reduce_size(); - 
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size ); + OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1)); #pragma omp parallel { ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type> ( m_functor - , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size) ); + , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) ); } /* END #pragma omp parallel */ } @@ -584,14 +608,15 @@ public: const Policy & arg_policy ) : m_functor( arg_functor ) , m_policy( arg_policy ) - , m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) + , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) {} }; -template< class FunctorType , class ... Properties > +template< class FunctorType , class ReducerType, class ... Properties > class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Properties ... 
> + , ReducerType , Kokkos::OpenMP > { @@ -602,15 +627,19 @@ private: typedef typename Policy::work_tag WorkTag ; typedef typename Policy::member_type Member ; - typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ; + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; const FunctorType m_functor ; const Policy m_policy ; + const ReducerType m_reducer ; const pointer_type m_result_ptr ; const int m_shmem_size ; @@ -644,7 +673,7 @@ public: const size_t team_reduce_size = Policy::member_type::team_reduce_size(); - OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , team_reduce_size + m_shmem_size ); + OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size ); #pragma omp parallel { @@ -652,8 +681,8 @@ public: ParallelReduce::template exec_team< WorkTag > ( m_functor - , Member( exec , m_policy , m_shmem_size ) - , ValueInit::init( m_functor , exec.scratch_reduce() ) ); + , Member( exec , m_policy , m_shmem_size, 0 ) + , ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) ); } /* END #pragma omp parallel */ @@ -665,13 +694,13 @@ public: max_active_threads = m_policy.league_size()* m_policy.team_size(); for ( int i = 1 ; i < max_active_threads ; ++i ) { - 
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() ); + ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() ); } - Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); if ( m_result_ptr ) { - const int n = ValueTraits::value_count( m_functor ); + const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; } } @@ -682,12 +711,33 @@ public: inline ParallelReduce( const FunctorType & arg_functor , const Policy & arg_policy , - const ViewType & arg_result ) + const ViewType & arg_result , + typename std::enable_if< + Kokkos::is_view< ViewType >::value && + !Kokkos::is_reducer_type<ReducerType>::value + ,void*>::type = NULL) : m_functor( arg_functor ) , m_policy( arg_policy ) + , m_reducer( InvalidType() ) , m_result_ptr( arg_result.ptr_on_device() ) - , m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) + , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) {} + + inline + ParallelReduce( const FunctorType & arg_functor + , Policy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + , m_reducer( reducer ) + , m_result_ptr( reducer.result_view().data() ) + , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + }; } // namespace 
Impl diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp new file mode 100644 index 0000000000..3e22033f7c --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp @@ -0,0 +1,329 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY ) + +#include <impl/Kokkos_TaskQueue_impl.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template class TaskQueue< Kokkos::OpenMP > ; + +//---------------------------------------------------------------------------- + +TaskExec< Kokkos::OpenMP >:: +TaskExec() + : m_self_exec( 0 ) + , m_team_exec( 0 ) + , m_sync_mask( 0 ) + , m_sync_value( 0 ) + , m_sync_step( 0 ) + , m_group_rank( 0 ) + , m_team_rank( 0 ) + , m_team_size( 1 ) +{ +} + +TaskExec< Kokkos::OpenMP >:: +TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size ) + : m_self_exec( & arg_exec ) + , m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) ) + , m_sync_mask( 0 ) + , m_sync_value( 0 ) + , m_sync_step( 0 ) + , m_group_rank( arg_exec.pool_rank_rev() / arg_team_size ) + , m_team_rank( arg_exec.pool_rank_rev() % arg_team_size ) + , m_team_size( arg_team_size ) +{ + // This team spans + // m_self_exec->pool_rev( team_size * group_rank ) + // m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 ) + + int64_t 
volatile * const sync = (int64_t *) m_self_exec->scratch_reduce(); + + sync[0] = int64_t(0) ; + sync[1] = int64_t(0) ; + + for ( int i = 0 ; i < m_team_size ; ++i ) { + m_sync_value |= int64_t(1) << (8*i); + m_sync_mask |= int64_t(3) << (8*i); + } + + Kokkos::memory_fence(); +} + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + +void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const +{ + if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) { + Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small"); + } + + // Use team shared memory to synchronize. + // Alternate memory locations between barriers to avoid a sequence + // of barriers overtaking one another. + + int64_t volatile * const sync = + ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 ); + + // This team member sets one byte within the sync variable + int8_t volatile * const sync_self = + ((int8_t *) sync) + m_team_rank ; + +#if 0 +fprintf( stdout + , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n" + , m_group_rank + , m_team_rank + , m_sync_step + , m_sync_value + , *sync + ); +fflush(stdout); +#endif + + *sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival + + while ( m_sync_value != *sync ); // wait for team to arrive + +#if 0 +fprintf( stdout + , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n" + , m_group_rank + , m_team_rank + , m_sync_step + , m_sync_value + , *sync + ); +fflush(stdout); +#endif + + ++m_sync_step ; + + if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step + m_sync_value ^= m_sync_mask ; + if ( 1000 < m_sync_step ) m_sync_step = 0 ; + } +} + +#endif + +//---------------------------------------------------------------------------- + +void TaskQueueSpecialization< Kokkos::OpenMP >::execute + ( TaskQueue< Kokkos::OpenMP > * const queue ) +{ + using execution_space = Kokkos::OpenMP ; + using queue_type = TaskQueue< execution_space > ; + using task_root_type = TaskBase< execution_space 
, void , void > ; + using PoolExec = Kokkos::Impl::OpenMPexec ; + using Member = TaskExec< execution_space > ; + + task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + + // Required: team_size <= 8 + + const int team_size = PoolExec::pool_size(2); // Threads per core + // const int team_size = PoolExec::pool_size(1); // Threads per NUMA + + if ( 8 < team_size ) { + Kokkos::abort("TaskQueue<OpenMP> unsupported team size"); + } + +#pragma omp parallel + { + PoolExec & self = *PoolExec::get_thread_omp(); + + Member single_exec ; + Member team_exec( self , team_size ); + + // Team shared memory + task_root_type * volatile * const task_shared = + (task_root_type **) team_exec.m_team_exec->scratch_thread(); + +// Barrier across entire OpenMP thread pool to insure initialization +#pragma omp barrier + + // Loop until all queues are empty and no tasks in flight + + do { + + task_root_type * task = 0 ; + + // Each team lead attempts to acquire either a thread team task + // or a single thread task for the team. + + if ( 0 == team_exec.team_rank() ) { + + task = 0 < *((volatile int *) & queue->m_ready_count) ? 
end : 0 ; + + // Loop by priority and then type + for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) { + for ( int j = 0 ; j < 2 && end == task ; ++j ) { + task = queue_type::pop_task( & queue->m_ready[i][j] ); + } + } + } + + // Team lead broadcast acquired task to team members: + + if ( 1 < team_exec.team_size() ) { + + if ( 0 == team_exec.team_rank() ) *task_shared = task ; + + // Fence to be sure task_shared is stored before the barrier + Kokkos::memory_fence(); + + // Whole team waits for every team member to reach this statement + team_exec.team_barrier(); + + // Fence to be sure task_shared is stored + Kokkos::memory_fence(); + + task = *task_shared ; + } + +#if 0 +fprintf( stdout + , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n" + , team_exec.m_group_rank + , team_exec.m_team_rank + , uintptr_t(task_shared) + , uintptr_t(task) + ); +fflush(stdout); +#endif + + if ( 0 == task ) break ; // 0 == m_ready_count + + if ( end == task ) { + // All team members wait for whole team to reach this statement. + // Is necessary to prevent task_shared from being updated + // before it is read by all threads. + team_exec.team_barrier(); + } + else if ( task_root_type::TaskTeam == task->m_task_type ) { + // Thread Team Task + (*task->m_apply)( task , & team_exec ); + + // The m_apply function performs a barrier + + if ( 0 == team_exec.team_rank() ) { + // team member #0 completes the task, which may delete the task + queue->complete( task ); + } + } + else { + // Single Thread Task + + if ( 0 == team_exec.team_rank() ) { + + (*task->m_apply)( task , & single_exec ); + + queue->complete( task ); + } + + // All team members wait for whole team to reach this statement. + // Not necessary to complete the task. + // Is necessary to prevent task_shared from being updated + // before it is read by all threads. 
+ team_exec.team_barrier(); + } + } while(1); + } +// END #pragma omp parallel + +} + +void TaskQueueSpecialization< Kokkos::OpenMP >:: + iff_single_thread_recursive_execute + ( TaskQueue< Kokkos::OpenMP > * const queue ) +{ + using execution_space = Kokkos::OpenMP ; + using queue_type = TaskQueue< execution_space > ; + using task_root_type = TaskBase< execution_space , void , void > ; + using Member = TaskExec< execution_space > ; + + if ( 1 == omp_get_num_threads() ) { + + task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + + Member single_exec ; + + task_root_type * task = end ; + + do { + + task = end ; + + // Loop by priority and then type + for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) { + for ( int j = 0 ; j < 2 && end == task ; ++j ) { + task = queue_type::pop_task( & queue->m_ready[i][j] ); + } + } + + if ( end == task ) break ; + + (*task->m_apply)( task , & single_exec ); + + queue->complete( task ); + + } while(1); + } +} + +}} /* namespace Kokkos::Impl */ + +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */ + + diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp new file mode 100644 index 0000000000..2761247c40 --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -0,0 +1,356 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP +#define KOKKOS_IMPL_OPENMP_TASK_HPP + +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template<> +class TaskQueueSpecialization< Kokkos::OpenMP > +{ +public: + + using execution_space = Kokkos::OpenMP ; + using queue_type = Kokkos::Impl::TaskQueue< execution_space > ; + using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ; + + // Must specify memory space + using memory_space = Kokkos::HostSpace ; + + static + void iff_single_thread_recursive_execute( queue_type * const ); + + // Must provide task queue execution function + static void execute( queue_type * const ); + + // Must provide mechanism to set function pointer in + // execution space from the host process. 
+ template< typename FunctorType > + static + void proc_set_apply( task_base_type::function_type * ptr ) + { + using TaskType = TaskBase< Kokkos::OpenMP + , typename FunctorType::value_type + , FunctorType + > ; + *ptr = TaskType::apply ; + } +}; + +extern template class TaskQueue< Kokkos::OpenMP > ; + +//---------------------------------------------------------------------------- + +template<> +class TaskExec< Kokkos::OpenMP > +{ +private: + + TaskExec( TaskExec && ) = delete ; + TaskExec( TaskExec const & ) = delete ; + TaskExec & operator = ( TaskExec && ) = delete ; + TaskExec & operator = ( TaskExec const & ) = delete ; + + + using PoolExec = Kokkos::Impl::OpenMPexec ; + + friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ; + friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ; + + PoolExec * const m_self_exec ; ///< This thread's thread pool data structure + PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure + int64_t m_sync_mask ; + int64_t mutable m_sync_value ; + int mutable m_sync_step ; + int m_group_rank ; ///< Which "team" subset of thread pool + int m_team_rank ; ///< Which thread within a team + int m_team_size ; + + TaskExec(); + TaskExec( PoolExec & arg_exec , int arg_team_size ); + + void team_barrier_impl() const ; + +public: + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + void * team_shared() const + { return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; } + + int team_shared_size() const + { return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; } + + /**\brief Whole team enters this function call + * before any team member returns from + * this function call. 
+ */ + void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); } +#else + KOKKOS_INLINE_FUNCTION void team_barrier() const {} + KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; } + KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; } +#endif + + KOKKOS_INLINE_FUNCTION + int team_rank() const { return m_team_rank ; } + + KOKKOS_INLINE_FUNCTION + int team_size() const { return m_team_size ; } +}; + +}} /* namespace Kokkos::Impl */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > > +TeamThreadRange + ( Impl::TaskExec< Kokkos::OpenMP > & thread + , const iType & count ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count); +} + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > > +TeamThreadRange + ( Impl:: TaskExec< Kokkos::OpenMP > & thread + , const iType & start + , const iType & end ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >(thread,start,end); +} + +/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the calling thread team. + * This functionality requires C++11 support. 
+*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for + ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries + , const Lambda& lambda + ) +{ + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i); + } +} + +template<typename iType, class Lambda, typename ValueType> +KOKKOS_INLINE_FUNCTION +void parallel_reduce + ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries + , const Lambda& lambda + , ValueType& initialized_result) +{ + int team_rank = loop_boundaries.thread.team_rank(); // member num within the team + ValueType result = initialized_result; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i, result); + } + + if ( 1 < loop_boundaries.thread.team_size() ) { + + ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared(); + + loop_boundaries.thread.team_barrier(); + shared[team_rank] = result; + + loop_boundaries.thread.team_barrier(); + + // reduce across threads to thread 0 + if (team_rank == 0) { + for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { + shared[0] += shared[i]; + } + } + + loop_boundaries.thread.team_barrier(); + + // broadcast result + initialized_result = shared[0]; + } + else { + initialized_result = result ; + } +} + +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries, + const Lambda & lambda, + const JoinType & join, + ValueType& initialized_result) +{ + int team_rank = loop_boundaries.thread.team_rank(); // member num within the team + ValueType result = initialized_result; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i, result); + } + 
+ if ( 1 < loop_boundaries.thread.team_size() ) { + ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared(); + + loop_boundaries.thread.team_barrier(); + shared[team_rank] = result; + + loop_boundaries.thread.team_barrier(); + + // reduce across threads to thread 0 + if (team_rank == 0) { + for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { + join(shared[0], shared[i]); + } + } + + loop_boundaries.thread.team_barrier(); + + // broadcast result + initialized_result = shared[0]; + } + else { + initialized_result = result ; + } +} + +// placeholder for future function +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries, + const Lambda & lambda, + ValueType& initialized_result) +{ +} + +// placeholder for future function +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries, + const Lambda & lambda, + const JoinType & join, + ValueType& initialized_result) +{ +} + +template< typename ValueType, typename iType, class Lambda > +KOKKOS_INLINE_FUNCTION +void parallel_scan + (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries, + const Lambda & lambda) +{ + ValueType accum = 0 ; + ValueType val, local_total; + ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared(); + int team_size = loop_boundaries.thread.team_size(); + int team_rank = loop_boundaries.thread.team_rank(); // member num within the team + + // Intra-member scan + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + local_total = 0; + lambda(i,local_total,false); + val = accum; + lambda(i,val,true); + accum += local_total; + } + + 
shared[team_rank] = accum; + loop_boundaries.thread.team_barrier(); + + // Member 0 do scan on accumulated totals + if (team_rank == 0) { + for( iType i = 1; i < team_size; i+=1) { + shared[i] += shared[i-1]; + } + accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan + } + + loop_boundaries.thread.team_barrier(); + + // Inter-member scan adding in accumulated totals + if (team_rank != 0) { accum = shared[team_rank-1]; } + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + local_total = 0; + lambda(i,local_total,false); + val = accum; + lambda(i,val,true); + accum += local_total; + } +} + +// placeholder for future function +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_scan + (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries, + const Lambda & lambda) +{ +} + + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ +#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */ + diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp index f73f1e932a..7d06a2f661 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp @@ -49,6 +49,7 @@ #include <impl/Kokkos_Error.hpp> #include <iostream> #include <impl/Kokkos_CPUDiscovery.hpp> +#include <impl/Kokkos_Profiling_Interface.hpp> #ifdef KOKKOS_HAVE_OPENMP @@ -85,16 +86,8 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 }; int OpenMPexec::m_pool_topo[ 4 ] = { 0 }; -#if ! 
KOKKOS_USING_EXP_VIEW - -OpenMPexec::Pool OpenMPexec::m_pool; - -#else - OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 }; -#endif - void OpenMPexec::verify_is_process( const char * const label ) { if ( omp_in_parallel() ) { @@ -125,16 +118,12 @@ void OpenMPexec::clear_scratch() #pragma omp parallel { const int rank_rev = m_map_rank[ omp_get_thread_num() ]; -#if KOKKOS_USING_EXP_VIEW typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ; if ( m_pool[ rank_rev ] ) { Record * const r = Record::get_record( m_pool[ rank_rev ] ); m_pool[ rank_rev ] = 0 ; Record::decrement( r ); } -#else - m_pool.at(rank_rev).clear(); -#endif } /* END #pragma omp parallel */ } @@ -172,8 +161,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size ) const int rank_rev = m_map_rank[ omp_get_thread_num() ]; const int rank = pool_size - ( rank_rev + 1 ); -#if KOKKOS_USING_EXP_VIEW - typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ; Record * const r = Record::allocate( Kokkos::HostSpace() @@ -184,15 +171,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size ) m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() ); -#else - - #pragma omp critical - { - m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size ); - } - -#endif - new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size ); } /* END #pragma omp parallel */ @@ -330,6 +308,10 @@ void OpenMP::initialize( unsigned thread_count , } // Init the array for used for arbitrarily sized atomics Impl::init_lock_array_host_space(); + + #if (KOKKOS_ENABLE_PROFILING) + Kokkos::Profiling::initialize(); + #endif } //---------------------------------------------------------------------------- @@ -350,6 +332,10 @@ void OpenMP::finalize() if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) { hwloc::unbind_this_thread(); } + + 
#if (KOKKOS_ENABLE_PROFILING) + Kokkos::Profiling::finalize(); + #endif } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp index 723b2f9429..a01c9cb644 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp @@ -46,7 +46,6 @@ #include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_spinwait.hpp> -#include <impl/Kokkos_AllocationTracker.hpp> #include <Kokkos_Atomic.hpp> #include <iostream> @@ -63,38 +62,10 @@ public: enum { MAX_THREAD_COUNT = 4096 }; -#if ! KOKKOS_USING_EXP_VIEW - - struct Pool - { - Pool() : m_trackers() {} - - AllocationTracker m_trackers[ MAX_THREAD_COUNT ]; - - OpenMPexec * operator[](int i) - { - return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr()); - } - - AllocationTracker & at(int i) - { - return m_trackers[i]; - } - }; - - -private: - - static Pool m_pool; // Indexed by: m_pool_rank_rev - -#else - private: static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev -#endif - static int m_pool_topo[ 4 ]; static int m_map_rank[ MAX_THREAD_COUNT ]; @@ -145,6 +116,12 @@ public: inline long team_work_index() const { return m_team_work_index ; } + inline int scratch_reduce_size() const + { return m_scratch_reduce_end - m_scratch_exec_end ; } + + inline int scratch_thread_size() const + { return m_scratch_thread_end - m_scratch_reduce_end ; } + inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; } inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; } @@ -157,15 +134,15 @@ public: ~OpenMPexec() {} - OpenMPexec( const int poolRank - , const int scratch_exec_size - , const int scratch_reduce_size - , const int scratch_thread_size ) - : m_pool_rank( poolRank ) - , m_pool_rank_rev( pool_size() - ( poolRank + 1 ) ) - , m_scratch_exec_end( scratch_exec_size 
) - , m_scratch_reduce_end( m_scratch_exec_end + scratch_reduce_size ) - , m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size ) + OpenMPexec( const int arg_poolRank + , const int arg_scratch_exec_size + , const int arg_scratch_reduce_size + , const int arg_scratch_thread_size ) + : m_pool_rank( arg_poolRank ) + , m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) ) + , m_scratch_exec_end( arg_scratch_exec_size ) + , m_scratch_reduce_end( m_scratch_exec_end + arg_scratch_reduce_size ) + , m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size ) , m_barrier_state(0) {} @@ -330,7 +307,7 @@ public: Impl::OpenMPexec & m_exec ; scratch_memory_space m_team_shared ; - int m_team_shmem ; + int m_team_scratch_size[2] ; int m_team_base_rev ; int m_team_rank_rev ; int m_team_rank ; @@ -378,15 +355,15 @@ public: KOKKOS_INLINE_FUNCTION const execution_space::scratch_memory_space& team_shmem() const - { return m_team_shared.set_team_thread_mode(1,0) ; } + { return m_team_shared.set_team_thread_mode(0,1,0) ; } KOKKOS_INLINE_FUNCTION const execution_space::scratch_memory_space& team_scratch(int) const - { return m_team_shared.set_team_thread_mode(1,0) ; } + { return m_team_shared.set_team_thread_mode(0,1,0) ; } KOKKOS_INLINE_FUNCTION const execution_space::scratch_memory_space& thread_scratch(int) const - { return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; } + { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; } KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; } KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; } @@ -568,11 +545,12 @@ public: inline OpenMPexecTeamMember( Impl::OpenMPexec & exec , const TeamPolicyInternal< OpenMP, Properties ...> & team - , const int shmem_size + , const int shmem_size_L1 + , const int shmem_size_L2 ) : m_exec( exec ) , m_team_shared(0,0) - , m_team_shmem( shmem_size ) + , m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 } , 
m_team_base_rev(0) , m_team_rank_rev(0) , m_team_rank(0) @@ -580,7 +558,7 @@ public: , m_league_rank(0) , m_league_end(0) , m_league_size( team.league_size() ) - , m_chunk_size( team.chunk_size() ) + , m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() ) , m_league_chunk_end(0) , m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) )) , m_team_alloc( team.team_alloc()) @@ -589,10 +567,9 @@ public: const int pool_team_rank_rev = pool_rank_rev % team.team_alloc(); const int pool_league_rank_rev = pool_rank_rev / team.team_alloc(); const int pool_num_teams = OpenMP::thread_pool_size(0)/team.team_alloc(); - const int chunk_size = team.chunk_size()>0?team.chunk_size():team.team_iter(); - const int chunks_per_team = ( team.league_size() + chunk_size*pool_num_teams-1 ) / (chunk_size*pool_num_teams); - int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * chunk_size; - int league_iter_begin = league_iter_end - chunks_per_team * chunk_size; + const int chunks_per_team = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams); + int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size; + int league_iter_begin = league_iter_end - chunks_per_team * m_chunk_size; if (league_iter_begin < 0) league_iter_begin = 0; if (league_iter_end>team.league_size()) league_iter_end = team.league_size(); @@ -611,7 +588,9 @@ public: m_team_rank = m_team_size - ( m_team_rank_rev + 1 ); m_league_end = league_iter_end ; m_league_rank = league_iter_begin ; - new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem ); + new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] , + ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + 
m_team_scratch_size[0], + 0 ); } if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) { @@ -627,10 +606,13 @@ public: void next_static() { - if ( ++m_league_rank < m_league_end ) { + if ( m_league_rank < m_league_end ) { team_barrier(); - new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem ); + new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] , + ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0], + 0); } + m_league_rank++; } bool valid_dynamic() { @@ -661,10 +643,13 @@ public: if(m_invalid_thread) return; - team_barrier(); - if ( ++m_league_rank < m_league_chunk_end ) { - new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem ); + if ( m_league_rank < m_league_chunk_end ) { + team_barrier(); + new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] , + ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0], + 0); } + m_league_rank++; } static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; } @@ -687,8 +672,10 @@ public: m_team_size = p.m_team_size; m_team_alloc = p.m_team_alloc; m_team_iter = p.m_team_iter; - m_team_scratch_size = p.m_team_scratch_size; - m_thread_scratch_size = p.m_thread_scratch_size; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; m_chunk_size = p.m_chunk_size; return *this; } @@ -719,8 +706,8 @@ private: int m_team_alloc ; int m_team_iter ; - size_t m_team_scratch_size; - size_t m_thread_scratch_size; + size_t 
m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; int m_chunk_size; @@ -753,15 +740,19 @@ public: inline int team_size() const { return m_team_size ; } inline int league_size() const { return m_league_size ; } - inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; } + inline size_t scratch_size(const int& level, int team_size_ = -1) const { + if(team_size_ < 0) + team_size_ = m_team_size; + return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ; + } /** \brief Specify league size, request team size */ TeamPolicyInternal( typename traits::execution_space & , int league_size_request , int team_size_request , int /* vector_length_request */ = 1 ) - : m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_chunk_size(0) { init( league_size_request , team_size_request ); } @@ -769,24 +760,24 @@ public: , int league_size_request , const Kokkos::AUTO_t & /* team_size_request */ , int /* vector_length_request */ = 1) - : m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_chunk_size(0) { init( league_size_request , traits::execution_space::thread_pool_size(2) ); } TeamPolicyInternal( int league_size_request , int team_size_request , int /* vector_length_request */ = 1 ) - : m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_chunk_size(0) { init( league_size_request , team_size_request ); } TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & /* team_size_request */ , int /* vector_length_request */ = 1 ) - : m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_chunk_size(0) { init( league_size_request , traits::execution_space::thread_pool_size(2) ); } @@ -803,24 
+794,21 @@ public: } inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const { - (void) level; TeamPolicyInternal p = *this; - p.m_team_scratch_size = per_team.value; + p.m_team_scratch_size[level] = per_team.value; return p; }; inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const { - (void) level; TeamPolicyInternal p = *this; - p.m_thread_scratch_size = per_thread.value; + p.m_thread_scratch_size[level] = per_thread.value; return p; }; inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const { - (void) level; TeamPolicyInternal p = *this; - p.m_team_scratch_size = per_team.value; - p.m_thread_scratch_size = per_thread.value; + p.m_team_scratch_size[level] = per_team.value; + p.m_thread_scratch_size[level] = per_thread.value; return p; }; diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp index 92c5b97b9a..3123a297c4 100644 --- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp +++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp @@ -104,7 +104,7 @@ namespace Kokkos { int Qthread::is_initialized() { - Impl::s_number_workers != 0 ; + return Impl::s_number_workers != 0 ; } int Qthread::concurrency() diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp index a1f533b232..f948eb2903 100644 --- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp +++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. 
-// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -113,7 +113,7 @@ public: m_worker_state = QthreadExec::Inactive ; Impl::spinwait( m_worker_state , QthreadExec::Inactive ); } - + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) { m_worker_base[j]->m_worker_state = QthreadExec::Active ; } @@ -136,7 +136,7 @@ public: m_worker_state = QthreadExec::Inactive ; Impl::spinwait( m_worker_state , QthreadExec::Inactive ); } - + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) { m_shepherd_base[j]->m_worker_state = QthreadExec::Active ; } @@ -145,11 +145,13 @@ public: //---------------------------------------- /** Reduce across all workers participating in the 'exec_all' */ - template< class FunctorType , class ArgTag > + template< class FunctorType , class ReducerType , class ArgTag > inline - void exec_all_reduce( const FunctorType & func ) const + void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const { - typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ; + typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ; const int rev_rank = m_worker_size - ( m_worker_rank + 1 ); @@ -160,14 +162,14 @@ public: Impl::spinwait( fan.m_worker_state , QthreadExec::Active ); - ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc ); + ValueJoin::join( ReducerConditional::select(func , reduce) , 
m_scratch_alloc , fan.m_scratch_alloc ); } if ( rev_rank ) { m_worker_state = QthreadExec::Inactive ; Impl::spinwait( m_worker_state , QthreadExec::Inactive ); } - + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) { m_worker_base[j]->m_worker_state = QthreadExec::Active ; } @@ -197,7 +199,7 @@ public: } else { // Root thread scans across values before releasing threads - // Worker data is in reverse order, so m_worker_base[0] is the + // Worker data is in reverse order, so m_worker_base[0] is the // highest ranking thread. // Copy from lower ranking to higher ranking worker. @@ -216,7 +218,7 @@ public: ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc ); } } - + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) { m_worker_base[j]->m_worker_state = QthreadExec::Active ; } @@ -349,7 +351,7 @@ public: } else { // Root thread scans across values before releasing threads - // Worker data is in reverse order, so m_shepherd_base[0] is the + // Worker data is in reverse order, so m_shepherd_base[0] is the // highest ranking thread. // Copy from lower ranking to higher ranking worker. @@ -371,7 +373,7 @@ public: memory_fence(); } - + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) { m_shepherd_base[j]->m_worker_state = QthreadExec::Active ; } diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp index 2e3cdce562..5b6419289f 100644 --- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp +++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. 
Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -130,9 +130,10 @@ public: //---------------------------------------------------------------------------- -template< class FunctorType , class ... Traits > +template< class FunctorType , class ReducerType , class ... Traits > class ParallelReduce< FunctorType , Kokkos::RangePolicy< Traits ... > + , ReducerType , Kokkos::Qthread > { @@ -141,18 +142,24 @@ private: typedef Kokkos::RangePolicy< Traits ... > Policy ; typedef typename Policy::work_tag WorkTag ; - typedef typename Policy::member_type Member ; typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; - typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; + typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + // Static Assert WorkTag void if ReducerType not InvalidType + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; - const FunctorType m_functor ; - const Policy m_policy ; - const pointer_type m_result_ptr ; + const FunctorType m_functor ; + const Policy m_policy ; + const ReducerType m_reducer ; + const pointer_type m_result_ptr ; template< class TagType > 
inline static @@ -187,9 +194,10 @@ private: ParallelReduce::template exec_range< WorkTag >( self.m_functor, range.begin(), range.end(), - ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) ); + ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) + , exec.exec_all_reduce_value() ) ); - exec.template exec_all_reduce<FunctorType, WorkTag >( self.m_functor ); + exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer ); } public: @@ -197,26 +205,39 @@ public: inline void execute() const { - QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 ); + QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 ); Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this ); const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result(); - Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data ); if ( m_result_ptr ) { - const unsigned n = ValueTraits::value_count( m_functor ); + const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } } } - template< class HostViewType > + template< class ViewType > ParallelReduce( const FunctorType & arg_functor , const Policy & arg_policy - , const HostViewType & arg_result_view ) + , const ViewType & arg_result_view + , typename std::enable_if<Kokkos::is_view< ViewType >::value && + !Kokkos::is_reducer_type< ReducerType >::value + , void*>::type = NULL) : m_functor( arg_functor ) - , m_policy( arg_policy ) - , m_result_ptr( arg_result_view.ptr_on_device() ) + , m_policy( arg_policy ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_result_view.data() ) + { } + + 
ParallelReduce( const FunctorType & arg_functor + , Policy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + , m_reducer( reducer ) + , m_result_ptr( reducer.result_view().data() ) { } }; @@ -291,10 +312,12 @@ public: //---------------------------------------------------------------------------- -template< class FunctorType , class ... Properties > +template< class FunctorType , class ReducerType , class ... Properties > class ParallelReduce< FunctorType , TeamPolicy< Properties... > - , Kokkos::Qthread > + , ReducerType + , Kokkos::Qthread + > { private: @@ -303,14 +326,18 @@ private: typedef typename Policy::work_tag WorkTag ; typedef typename Policy::member_type Member ; - typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; const FunctorType m_functor ; const Policy m_policy ; + const ReducerType m_reducer ; const pointer_type m_result_ptr ; template< class TagType > @@ -345,9 +372,10 @@ private: ParallelReduce::template exec_team< WorkTag > ( self.m_functor , Member( exec , self.m_policy ) - , ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) ); + , ValueInit::init( ReducerConditional::select( self.m_functor , self.m_reducer ) + , exec.exec_all_reduce_value() ) ); - exec.template exec_all_reduce< FunctorType , WorkTag >( self.m_functor ); + exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, 
self.m_reducer ); } public: @@ -356,29 +384,43 @@ public: void execute() const { QthreadExec::resize_worker_scratch - ( /* reduction memory */ ValueTraits::value_size( m_functor ) + ( /* reduction memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) ); Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this ); const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result(); - Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data ); if ( m_result_ptr ) { - const unsigned n = ValueTraits::value_count( m_functor ); + const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } } } template< class ViewType > - ParallelReduce( const FunctorType & arg_functor , - const Policy & arg_policy , - const ViewType & arg_result ) + ParallelReduce( const FunctorType & arg_functor + , const Policy & arg_policy + , const ViewType & arg_result + , typename std::enable_if<Kokkos::is_view< ViewType >::value && + !Kokkos::is_reducer_type< ReducerType >::value + , void*>::type = NULL) : m_functor( arg_functor ) - , m_policy( arg_policy ) + , m_policy( arg_policy ) + , m_reducer( InvalidType() ) , m_result_ptr( arg_result.ptr_on_device() ) { } + + inline + ParallelReduce( const FunctorType & arg_functor + , Policy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + , m_reducer( reducer ) + , m_result_ptr( reducer.result_view().data() ) + { } }; //---------------------------------------------------------------------------- @@ -395,8 +437,8 @@ private: typedef Kokkos::RangePolicy< Traits ... 
> Policy ; typedef typename Policy::work_tag WorkTag ; - typedef typename Policy::member_type Member ; typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp index 0765072030..8cc39d277c 100644 --- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp +++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp @@ -58,6 +58,8 @@ #include <Kokkos_Atomic.hpp> #include <Qthread/Kokkos_Qthread_TaskPolicy.hpp> +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + //---------------------------------------------------------------------------- namespace Kokkos { @@ -120,13 +122,13 @@ Task::~TaskMember() } -Task::TaskMember( const function_verify_type arg_verify - , const function_dealloc_type arg_dealloc - , const function_apply_single_type arg_apply_single - , const function_apply_team_type arg_apply_team - , volatile int & arg_active_count - , const unsigned arg_sizeof_derived - , const unsigned arg_dependence_capacity +Task::TaskMember( const function_verify_type arg_verify + , const function_dealloc_type arg_dealloc + , const function_single_type arg_apply_single + , const function_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity ) : m_dealloc( arg_dealloc ) , m_verify( arg_verify ) @@ -144,12 +146,12 @@ Task::TaskMember( const function_verify_type arg_verify for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ; } -Task::TaskMember( const function_dealloc_type arg_dealloc - , const function_apply_single_type arg_apply_single - , const function_apply_team_type arg_apply_team - , volatile int & arg_active_count - , const unsigned 
arg_sizeof_derived - , const unsigned arg_dependence_capacity +Task::TaskMember( const function_dealloc_type arg_dealloc + , const function_single_type arg_apply_single + , const function_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity ) : m_dealloc( arg_dealloc ) , m_verify( & Task::verify_type<void> ) @@ -316,12 +318,8 @@ aligned_t Task::qthread_func( void * arg ) , int(Kokkos::Experimental::TASK_STATE_EXECUTING) ); - // It is a single thread's responsibility to close out - // this task's execution. - bool close_out = false ; - if ( task->m_apply_team && ! task->m_apply_single ) { - const Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ; + Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ; // Initialize team size and rank with shephered info Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag ); @@ -344,7 +342,7 @@ fflush(stdout); if ( member.team_rank() == 0 ) task->closeout(); member.team_barrier(); } - else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_apply_single_type>(1) ) { + else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) { // Team hard-wired to one, no cloning Kokkos::Impl::QthreadTeamPolicyMember member ; (*task->m_apply_team)( task , member ); @@ -488,5 +486,6 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy ) } // namespace Experimental } // namespace Kokkos +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ #endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */ diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp index 118f13d9f1..22a565503d 100644 --- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp +++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp @@ -69,6 +69,8 @@ #include <impl/Kokkos_FunctorAdapter.hpp> +#if 
defined( KOKKOS_ENABLE_TASKPOLICY ) + //---------------------------------------------------------------------------- namespace Kokkos { @@ -80,24 +82,24 @@ class TaskMember< Kokkos::Qthread , void , void > { public: - typedef void (* function_apply_single_type) ( TaskMember * ); - typedef void (* function_apply_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & ); - typedef void (* function_dealloc_type)( TaskMember * ); typedef TaskMember * (* function_verify_type) ( TaskMember * ); + typedef void (* function_single_type) ( TaskMember * ); + typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & ); + typedef void (* function_dealloc_type)( TaskMember * ); private: - const function_dealloc_type m_dealloc ; ///< Deallocation - const function_verify_type m_verify ; ///< Result type verification - const function_apply_single_type m_apply_single ; ///< Apply function - const function_apply_team_type m_apply_team ; ///< Apply function - int volatile * const m_active_count ; ///< Count of active tasks on this policy - aligned_t m_qfeb ; ///< Qthread full/empty bit - TaskMember ** const m_dep ; ///< Dependences - const int m_dep_capacity ; ///< Capacity of dependences - int m_dep_size ; ///< Actual count of dependences - int m_ref_count ; ///< Reference count - int m_state ; ///< State of the task + const function_dealloc_type m_dealloc ; ///< Deallocation + const function_verify_type m_verify ; ///< Result type verification + const function_single_type m_apply_single ; ///< Apply function + const function_team_type m_apply_team ; ///< Apply function + int volatile * const m_active_count ; ///< Count of active tasks on this policy + aligned_t m_qfeb ; ///< Qthread full/empty bit + TaskMember ** const m_dep ; ///< Dependences + const int m_dep_capacity ; ///< Capacity of dependences + int m_dep_size ; ///< Actual count of dependences + int m_ref_count ; ///< Reference count + int m_state ; ///< State of the task 
TaskMember() /* = delete */ ; TaskMember( const TaskMember & ) /* = delete */ ; @@ -128,22 +130,22 @@ protected : ~TaskMember(); // Used by TaskMember< Qthread , ResultType , void > - TaskMember( const function_verify_type arg_verify - , const function_dealloc_type arg_dealloc - , const function_apply_single_type arg_apply_single - , const function_apply_team_type arg_apply_team - , volatile int & arg_active_count - , const unsigned arg_sizeof_derived - , const unsigned arg_dependence_capacity + TaskMember( const function_verify_type arg_verify + , const function_dealloc_type arg_dealloc + , const function_single_type arg_apply_single + , const function_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity ); // Used for TaskMember< Qthread , void , void > - TaskMember( const function_dealloc_type arg_dealloc - , const function_apply_single_type arg_apply_single - , const function_apply_team_type arg_apply_team - , volatile int & arg_active_count - , const unsigned arg_sizeof_derived - , const unsigned arg_dependence_capacity + TaskMember( const function_dealloc_type arg_dealloc + , const function_single_type arg_apply_single + , const function_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity ); public: @@ -221,7 +223,7 @@ public: typedef typename DerivedTaskType::functor_type functor_type ; typedef typename functor_type::value_type value_type ; - const function_apply_single_type flag = reinterpret_cast<function_apply_single_type>( arg_is_team ? 0 : 1 ); + const function_single_type flag = reinterpret_cast<function_single_type>( arg_is_team ? 
0 : 1 ); DerivedTaskType * const task = new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) ) @@ -379,16 +381,16 @@ protected: typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ; typedef task_root_type::function_dealloc_type function_dealloc_type ; - typedef task_root_type::function_apply_single_type function_apply_single_type ; - typedef task_root_type::function_apply_team_type function_apply_team_type ; + typedef task_root_type::function_single_type function_single_type ; + typedef task_root_type::function_team_type function_team_type ; inline - TaskMember( const function_dealloc_type arg_dealloc - , const function_apply_single_type arg_apply_single - , const function_apply_team_type arg_apply_team - , volatile int & arg_active_count - , const unsigned arg_sizeof_derived - , const unsigned arg_dependence_capacity + TaskMember( const function_dealloc_type arg_dealloc + , const function_single_type arg_apply_single + , const function_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity ) : task_root_type( & task_root_type::template verify_type< ResultType > , arg_dealloc @@ -413,17 +415,17 @@ public: typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ; typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ; typedef task_root_type::function_dealloc_type function_dealloc_type ; - typedef task_root_type::function_apply_single_type function_apply_single_type ; - typedef task_root_type::function_apply_team_type function_apply_team_type ; + typedef task_root_type::function_single_type function_single_type ; + typedef task_root_type::function_team_type function_team_type ; inline - TaskMember( const function_dealloc_type arg_dealloc - , const function_apply_single_type arg_apply_single - , const function_apply_team_type arg_apply_team - , volatile int & arg_active_count - , const unsigned arg_sizeof_derived - , const 
unsigned arg_dependence_capacity - , const functor_type & arg_functor + TaskMember( const function_dealloc_type arg_dealloc + , const function_single_type arg_apply_single + , const function_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + , const functor_type & arg_functor ) : task_base_type( arg_dealloc , arg_apply_single @@ -453,6 +455,7 @@ class TaskPolicy< Kokkos::Qthread > public: typedef Kokkos::Qthread execution_space ; + typedef TaskPolicy execution_policy ; typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ; private: @@ -489,14 +492,17 @@ public: , const unsigned arg_task_team_size = 0 /* choose default */ ); - TaskPolicy() = default ; - TaskPolicy( TaskPolicy && rhs ) = default ; - TaskPolicy( const TaskPolicy & rhs ) = default ; - TaskPolicy & operator = ( TaskPolicy && rhs ) = default ; - TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ; + KOKKOS_FUNCTION TaskPolicy() = default ; + KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ; + KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ; + KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ; + KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ; //---------------------------------------- + KOKKOS_INLINE_FUNCTION + int allocated_task_count() const { return m_active_count ; } + template< class ValueType > const Future< ValueType , execution_space > & spawn( const Future< ValueType , execution_space > & f @@ -653,5 +659,6 @@ public: //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ #endif /* #define KOKKOS_QTHREAD_TASK_HPP */ diff --git a/lib/kokkos/core/src/Qthread/README b/lib/kokkos/core/src/Qthread/README index 5d8f29a4ee..6e6c86a9ef 100644 --- 
a/lib/kokkos/core/src/Qthread/README +++ b/lib/kokkos/core/src/Qthread/README @@ -3,26 +3,23 @@ # Cloning repository and branch: -git clone https://github.com/stelleg/qthreads qthreads-with-clone +git clone git@github.com:Qthreads/qthreads.git qthreads -cd qthreads-with-clone +cd qthreads -# Added to ./git/config -# -# [branch "cloned_tasks"] -# remote = origin -# merge = refs/heads/cloned_tasks -# +# checkout branch with "cloned tasks" -git branch cloned_tasks -git checkout cloned_tasks -git pull +git checkout dev-kokkos + +# Configure/autogen sh autogen.sh -# configurure with 'hwloc' installation: +# configure with 'hwloc' installation: ./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR} +# install +make install diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp index e1d3fe06e6..5f0b8f70cd 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp @@ -53,6 +53,7 @@ #include <Kokkos_Core.hpp> #include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_CPUDiscovery.hpp> +#include <impl/Kokkos_Profiling_Interface.hpp> //---------------------------------------------------------------------------- @@ -134,11 +135,7 @@ void ThreadsExec::driver(void) ThreadsExec::ThreadsExec() : m_pool_base(0) -#if ! 
KOKKOS_USING_EXP_VIEW - , m_scratch() -#else , m_scratch(0) -#endif , m_scratch_reduce_end(0) , m_scratch_thread_end(0) , m_numa_rank(0) @@ -198,8 +195,6 @@ ThreadsExec::~ThreadsExec() { const unsigned entry = m_pool_size - ( m_pool_rank + 1 ); -#if KOKKOS_USING_EXP_VIEW - typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ; if ( m_scratch ) { @@ -210,12 +205,6 @@ ThreadsExec::~ThreadsExec() Record::decrement( r ); } -#else - - m_scratch.clear(); - -#endif - m_pool_base = 0 ; m_scratch_reduce_end = 0 ; m_scratch_thread_end = 0 ; @@ -439,8 +428,6 @@ void * ThreadsExec::root_reduce_scratch() void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * ) { -#if KOKKOS_USING_EXP_VIEW - typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ; if ( exec.m_scratch ) { @@ -451,19 +438,11 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * ) Record::decrement( r ); } -#else - - exec.m_scratch.clear(); - -#endif - exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ; exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ; if ( s_threads_process.m_scratch_thread_end ) { -#if KOKKOS_USING_EXP_VIEW - // Allocate tracked memory: { Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end ); @@ -475,15 +454,6 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * ) unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch ); -#else - - exec.m_scratch = - HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end ); - - unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() ); - -#endif - unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned); // touch on this thread @@ -520,11 +490,7 @@ void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size ) 
s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ; } -#if KOKKOS_USING_EXP_VIEW return s_threads_process.m_scratch ; -#else - return s_threads_process.m_scratch.alloc_ptr() ; -#endif } //---------------------------------------------------------------------------- @@ -758,6 +724,9 @@ void ThreadsExec::initialize( unsigned thread_count , // Init the array for used for arbitrarily sized atomics Impl::init_lock_array_host_space(); + #if (KOKKOS_ENABLE_PROFILING) + Kokkos::Profiling::initialize(); + #endif } //---------------------------------------------------------------------------- @@ -807,6 +776,10 @@ void ThreadsExec::finalize() s_threads_process.m_pool_size = 1 ; s_threads_process.m_pool_fan_size = 0 ; s_threads_process.m_pool_state = ThreadsExec::Inactive ; + + #if (KOKKOS_ENABLE_PROFILING) + Kokkos::Profiling::finalize(); + #endif } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp index b2019aaf77..4ec1450d0f 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp @@ -49,7 +49,6 @@ #include <utility> #include <impl/Kokkos_spinwait.hpp> #include <impl/Kokkos_FunctorAdapter.hpp> -#include <impl/Kokkos_AllocationTracker.hpp> #include <Kokkos_Atomic.hpp> @@ -89,11 +88,7 @@ private: ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in -#if ! KOKKOS_USING_EXP_VIEW - Impl::AllocationTracker m_scratch ; -#else void * m_scratch ; -#endif int m_scratch_reduce_end ; int m_scratch_thread_end ; int m_numa_rank ; @@ -138,19 +133,10 @@ public: static int get_thread_count(); static ThreadsExec * get_thread( const int init_thread_rank ); -#if ! 
KOKKOS_USING_EXP_VIEW - - inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); } - KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; } - -#else - inline void * reduce_memory() const { return m_scratch ; } KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; } -#endif - KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; } KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; } diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp index b425ac4773..3407ffaa54 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp @@ -129,15 +129,15 @@ public: KOKKOS_INLINE_FUNCTION const execution_space::scratch_memory_space & team_shmem() const - { return m_team_shared.set_team_thread_mode(1,0) ; } + { return m_team_shared.set_team_thread_mode(0,1,0) ; } KOKKOS_INLINE_FUNCTION const execution_space::scratch_memory_space & team_scratch(int) const - { return m_team_shared.set_team_thread_mode(1,0) ; } + { return m_team_shared.set_team_thread_mode(0,1,0) ; } KOKKOS_INLINE_FUNCTION const execution_space::scratch_memory_space & thread_scratch(int) const - { return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; } + { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; } KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; } KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; } @@ -433,10 +433,11 @@ public: void next_static() { - if ( ++m_league_rank < m_league_end ) { + if ( m_league_rank < m_league_end ) { team_barrier(); set_team_shared(); } + m_league_rank++; } bool valid_dynamic() { @@ -468,10 +469,11 @@ public: 
if(m_invalid_thread) return; - team_barrier(); - if ( ++m_league_rank < m_league_chunk_end ) { + if ( m_league_rank < m_league_chunk_end ) { + team_barrier(); set_team_shared(); } + m_league_rank++; } void set_league_shmem( const int arg_league_rank @@ -504,8 +506,8 @@ private: int m_team_alloc ; int m_team_iter ; - size_t m_team_scratch_size; - size_t m_thread_scratch_size; + size_t m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; int m_chunk_size; @@ -549,8 +551,10 @@ public: m_team_size = p.m_team_size; m_team_alloc = p.m_team_alloc; m_team_iter = p.m_team_iter; - m_team_scratch_size = p.m_team_scratch_size; - m_thread_scratch_size = p.m_thread_scratch_size; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; m_chunk_size = p.m_chunk_size; return *this; } @@ -577,7 +581,12 @@ public: inline int team_size() const { return m_team_size ; } inline int team_alloc() const { return m_team_alloc ; } inline int league_size() const { return m_league_size ; } - inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; } + inline size_t scratch_size(const int& level, int team_size_ = -1 ) const { + if(team_size_ < 0) + team_size_ = m_team_size; + return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ; + } + inline int team_iter() const { return m_team_iter ; } /** \brief Specify league size, request team size */ @@ -588,8 +597,8 @@ public: : m_league_size(0) , m_team_size(0) , m_team_alloc(0) - , m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + , m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_chunk_size(0) { init(league_size_request,team_size_request); (void) vector_length_request; } @@ -601,8 +610,8 @@ public: : m_league_size(0) , m_team_size(0) , m_team_alloc(0) - , m_team_scratch_size ( 0 ) - , 
m_thread_scratch_size ( 0 ) + , m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_chunk_size(0) { init(league_size_request,traits::execution_space::thread_pool_size(2)); } @@ -612,8 +621,8 @@ public: : m_league_size(0) , m_team_size(0) , m_team_alloc(0) - , m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + , m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_chunk_size(0) { init(league_size_request,team_size_request); } @@ -623,8 +632,8 @@ public: : m_league_size(0) , m_team_size(0) , m_team_alloc(0) - , m_team_scratch_size ( 0 ) - , m_thread_scratch_size ( 0 ) + , m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } , m_chunk_size(0) { init(league_size_request,traits::execution_space::thread_pool_size(2)); } @@ -639,26 +648,23 @@ public: /** \brief set per team scratch size for a specific level of the scratch hierarchy */ inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const { - (void) level; TeamPolicyInternal p = *this; - p.m_team_scratch_size = per_team.value; + p.m_team_scratch_size[level] = per_team.value; return p; }; /** \brief set per thread scratch size for a specific level of the scratch hierarchy */ inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const { - (void) level; TeamPolicyInternal p = *this; - p.m_thread_scratch_size = per_thread.value; + p.m_thread_scratch_size[level] = per_thread.value; return p; }; /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */ inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const { - (void) level; TeamPolicyInternal p = *this; - p.m_team_scratch_size = per_team.value; - p.m_thread_scratch_size = per_thread.value; + p.m_team_scratch_size[level] = per_team.value; + p.m_thread_scratch_size[level] = per_thread.value; return p; }; diff --git 
a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp index 55ddecf87f..1aba00c94b 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp @@ -264,7 +264,7 @@ public: , const Policy & arg_policy ) : m_functor( arg_functor ) , m_policy( arg_policy ) - , m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) + , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) { } }; @@ -272,9 +272,10 @@ public: //---------------------------------------------------------------------------- /* ParallelReduce with Kokkos::Threads and RangePolicy */ -template< class FunctorType , class ... Traits > +template< class FunctorType , class ReducerType, class ... Traits > class ParallelReduce< FunctorType , Kokkos::RangePolicy< Traits ... 
> + , ReducerType , Kokkos::Threads > { @@ -286,14 +287,18 @@ private: typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::member_type Member ; - typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; const FunctorType m_functor ; const Policy m_policy ; + const ReducerType m_reducer ; const pointer_type m_result_ptr ; template< class TagType > @@ -344,9 +349,9 @@ private: ParallelReduce::template exec_range< WorkTag > ( self.m_functor , range.begin() , range.end() - , ValueInit::init( self.m_functor , exec.reduce_memory() ) ); + , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) ); - exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor ); + exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); } template<class Schedule> @@ -362,7 +367,7 @@ private: exec.barrier(); long work_index = exec.get_work_index(); - reference_type update = ValueInit::init( self.m_functor , exec.reduce_memory() ); + reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ); while(work_index != -1) { const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size(); const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); @@ 
-372,7 +377,7 @@ private: work_index = exec.get_work_index(); } - exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor ); + exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); } public: @@ -380,7 +385,7 @@ public: inline void execute() const { - ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 ); + ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 ); ThreadsExec::start( & ParallelReduce::exec , this ); @@ -391,7 +396,7 @@ public: const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch(); - const unsigned n = ValueTraits::value_count( m_functor ); + const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } } } @@ -399,9 +404,14 @@ public: template< class HostViewType > ParallelReduce( const FunctorType & arg_functor , const Policy & arg_policy , - const HostViewType & arg_result_view ) + const HostViewType & arg_result_view , + typename std::enable_if< + Kokkos::is_view< HostViewType >::value && + !Kokkos::is_reducer_type<ReducerType>::value + ,void*>::type = NULL) : m_functor( arg_functor ) , m_policy( arg_policy ) + , m_reducer( InvalidType() ) , m_result_ptr( arg_result_view.ptr_on_device() ) { static_assert( Kokkos::is_view< HostViewType >::value @@ -410,14 +420,30 @@ public: static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value , "Kokkos::Threads reduce result must be a View in HostSpace" ); } + + inline + ParallelReduce( const FunctorType & arg_functor + , Policy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + , m_reducer( reducer ) + , m_result_ptr( reducer.result_view().data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , 
"Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + }; //---------------------------------------------------------------------------- /* ParallelReduce with Kokkos::Threads and TeamPolicy */ -template< class FunctorType , class ... Properties > +template< class FunctorType , class ReducerType, class ... Properties > class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Properties ... > + , ReducerType , Kokkos::Threads > { @@ -426,14 +452,19 @@ private: typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... > Policy ; typedef typename Policy::work_tag WorkTag ; typedef typename Policy::member_type Member ; - typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; + + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; const FunctorType m_functor ; const Policy m_policy ; + const ReducerType m_reducer ; const pointer_type m_result_ptr ; const int m_shared ; @@ -464,9 +495,9 @@ private: ParallelReduce::template exec_team< WorkTag > ( self.m_functor , Member( & exec , self.m_policy , self.m_shared ) - , ValueInit::init( self.m_functor , exec.reduce_memory() ) ); + , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) ); - exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor ); + exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); } public: @@ -474,7 +505,7 @@ public: inline void 
execute() const { - ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , Policy::member_type::team_reduce_size() + m_shared ); + ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , Policy::member_type::team_reduce_size() + m_shared ); ThreadsExec::start( & ParallelReduce::exec , this ); @@ -484,20 +515,41 @@ public: const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch(); - const unsigned n = ValueTraits::value_count( m_functor ); + const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } } } template< class ViewType > - ParallelReduce( const FunctorType & arg_functor - , const Policy & arg_policy - , const ViewType & arg_result ) + inline + ParallelReduce( const FunctorType & arg_functor , + const Policy & arg_policy , + const ViewType & arg_result , + typename std::enable_if< + Kokkos::is_view< ViewType >::value && + !Kokkos::is_reducer_type<ReducerType>::value + ,void*>::type = NULL) : m_functor( arg_functor ) - , m_policy( arg_policy ) + , m_policy( arg_policy ) + , m_reducer( InvalidType() ) , m_result_ptr( arg_result.ptr_on_device() ) - , m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) - { } + , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) + {} + + inline + ParallelReduce( const FunctorType & arg_functor + , Policy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + , m_reducer( reducer ) + , m_result_ptr( reducer.result_view().data() ) + , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) + { + /*static_assert( std::is_same< 
typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } }; //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp index 258e683a4f..e1599284b2 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp @@ -46,9 +46,10 @@ #include <stdio.h> #include <iostream> #include <sstream> +#include <Kokkos_Core.hpp> #include <Threads/Kokkos_Threads_TaskPolicy.hpp> -#if defined( KOKKOS_HAVE_PTHREAD ) +#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) #define QLOCK (reinterpret_cast<void*>( ~((uintptr_t)0) )) #define QDENIED (reinterpret_cast<void*>( ~((uintptr_t)0) - 1 )) @@ -87,9 +88,8 @@ ThreadsTaskPolicyQueue::ThreadsTaskPolicyQueue , const unsigned arg_task_team_size ) : m_space( Kokkos::Threads::memory_space() - , arg_task_max_size - , arg_task_max_size * arg_task_max_count - , 1 /* only one level of memory pool */ + , arg_task_max_size * arg_task_max_count * 1.2 + , 16 /* log2(superblock size) */ ) , m_team { 0 , 0 , 0 } , m_serial { 0 , 0 , 0 } @@ -624,10 +624,10 @@ ThreadsTaskPolicyQueue::allocate_task // User created task memory pool with an estimate, // if estimate is to low then report and throw exception. 
- if ( m_space.get_min_chunk_size() < size_alloc ) { + if ( m_space.get_min_block_size() < size_alloc ) { fprintf(stderr,"TaskPolicy<Threads> task allocation requires %d bytes on memory pool with %d byte chunk size\n" , int(size_alloc) - , int(m_space.get_min_chunk_size()) + , int(m_space.get_min_block_size()) ); fflush(stderr); Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::task_allocate"); @@ -926,5 +926,5 @@ void Task::clear_dependence() } /* namespace Experimental */ } /* namespace Kokkos */ -#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */ +#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */ diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp index a0c28afd0c..116d32e4fc 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp @@ -50,7 +50,7 @@ #include <Kokkos_Threads.hpp> #include <Kokkos_TaskPolicy.hpp> -#if defined( KOKKOS_HAVE_PTHREAD ) +#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) //---------------------------------------------------------------------------- @@ -737,10 +737,9 @@ public: } /* namespace Experimental */ } /* namespace Kokkos */ -#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */ - //---------------------------------------------------------------------------- +#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */ #endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */ diff --git a/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp index feb3632d43..1498eafb00 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp @@ -246,8 +246,8 @@ private: enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul }; // The allocation record resides in Host memory space - Record * m_record ; 
uintptr_t m_record_bits ; + Record * m_record ; public: diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp index 9ace88dfb1..6525fed0a5 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp @@ -47,8 +47,6 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#if KOKKOS_USING_EXP_VIEW - namespace Kokkos { /* For backward compatibility */ @@ -68,8 +66,6 @@ struct ViewAllocateWithoutInitializing { } /* namespace Kokkos */ -#endif - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp index 39339185e7..ed56536cd9 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp @@ -2604,18 +2604,24 @@ class ViewMapping< DstTraits , SrcTraits , && std::is_same< typename DstTraits::specialize , void >::value && - ( - std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value || - std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value || - std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value - ) - && std::is_same< typename SrcTraits::specialize , void >::value && ( - std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value || - std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value || - std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value + std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value + || + ( + ( + std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value || + std::is_same< 
typename DstTraits::array_layout , Kokkos::LayoutRight >::value || + std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value + ) + && + ( + std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value || + std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value || + std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value + ) + ) ) )>::type > { diff --git a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp deleted file mode 100644 index c95557793a..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp +++ /dev/null @@ -1,848 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include <Kokkos_Core_fwd.hpp> - -#if ! KOKKOS_USING_EXP_VIEW - -#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) - -#include <Kokkos_Atomic.hpp> - -#include <impl/Kokkos_Singleton.hpp> -#include <impl/Kokkos_AllocationTracker.hpp> -#include <impl/Kokkos_Error.hpp> - - -#include <string> -#include <vector> -#include <sstream> -#include <algorithm> -#include <utility> -#include <cstdlib> -#include <cstring> -#include <iostream> -#include <iomanip> - -/* Enable clean up of memory leaks */ -#define CLEAN_UP_MEMORY_LEAKS 0 - -namespace Kokkos { namespace Impl { - -namespace { - - -//----------------------------------------------------------------------------- -// AllocationRecord -//----------------------------------------------------------------------------- -// -// Used to track details about an allocation and provide a ref count -// sizeof(AllocationRecord) == 128 -struct AllocationRecord -{ - enum { - OFFSET = sizeof(AllocatorBase*) // allocator - + sizeof(void*) // alloc_ptr - + sizeof(uint64_t) // alloc_size - + sizeof(AllocatorAttributeBase*) // attribute - + sizeof(uint32_t) // node_index - + sizeof(uint32_t) // ref_count - , LABEL_LENGTH = 128 - OFFSET - }; - - AllocatorBase * const allocator; - void * const alloc_ptr; - const uint64_t alloc_size; - 
AllocatorAttributeBase * const attribute; - const int32_t node_index; - volatile uint32_t ref_count; - const char label[LABEL_LENGTH]; - - - AllocationRecord( AllocatorBase * const arg_allocator - , void * arg_alloc_ptr - , uint64_t arg_alloc_size - , int32_t arg_node_index - , const std::string & arg_label - ) - : allocator(arg_allocator) - , alloc_ptr(arg_alloc_ptr) - , alloc_size(arg_alloc_size) - , attribute(NULL) - , node_index(arg_node_index) - , ref_count(1) - , label() // zero fill - { - const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size(); - strncpy( const_cast<char *>(label), arg_label.c_str(), length ); - } - - ~AllocationRecord() - { - if (attribute) { - delete attribute; - } - } - - uint32_t increment_ref_count() - { - uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) ); - return old_value + 1u; - } - - uint32_t decrement_ref_count() - { - uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) ); - return old_value - 1u; - } - - void print( std::ostream & oss ) const - { - oss << "{ " << allocator->name() - << " } : \"" << label - << "\" ref_count(" << ref_count - << ") memory[ " << alloc_ptr - << " + " << alloc_size - << " ]" ; - } - - bool set_attribute( AllocatorAttributeBase * attr ) - { - bool result = false; - if (attribute == NULL) { - result = NULL == atomic_compare_exchange( const_cast<AllocatorAttributeBase **>(&attribute) - , reinterpret_cast<AllocatorAttributeBase *>(NULL) - , attr ); - } - - return result; - } - - // disallow copy and assignment - AllocationRecord( const AllocationRecord & ); - AllocationRecord & operator=(const AllocationRecord &); -}; - -template <int NumBlocks> -struct Bitset -{ - enum { blocks = NumBlocks }; - enum { size = blocks * 64 }; - enum { block_mask = 63u }; - enum { block_shift = 6 }; - - // used to find free bits in a bitset - static int count_trailing_zeros(uint64_t x) - { - #if 
defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC ) - return x ? __builtin_ctzll(x) : 64; - #elif defined( KOKKOS_COMPILER_INTEL ) - enum { shift = 32 }; - enum { mask = (static_cast<uint64_t>(1) << shift) - 1u }; - return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) : - (x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) : - 64 ; - #elif defined( KOKKOS_COMPILER_IBM ) - return x ? __cnttz8(x) : 64; - #else - int i = 0; - for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {} - return i; - #endif - } - - Bitset() - : m_bits() - { - for (int i=0; i < blocks; ++i) { - m_bits[i] = 0u; - } - } - - bool set( int i ) - { - const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask ); - return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit ); - } - - bool reset( int i ) - { - const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask ); - return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit; - } - - bool test( int i ) - { - const uint64_t block = m_bits[ i >> block_shift ]; - const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask ); - return block & bit; - } - - int find_first_unset() const - { - for (int i=0; i < blocks; ++i) { - const uint64_t block = m_bits[i]; - int b = count_trailing_zeros( ~block ); - - if ( b < 64 ) { - return (i << block_shift) + b; - } - } - return size; - } - - volatile uint64_t m_bits[blocks]; -}; - -//----------------------------------------------------------------------------- -// AllocationRecordPool -- singleton class -// -// global_alloc_rec_pool is the ONLY instance of this class -// -//----------------------------------------------------------------------------- -// Record AllocationRecords in a lock-free circular list. -// Each node in the list has a buffer with space for 959 ((15*64)-1) records -// managed by a bitset. Atomics are used to set and reset bits in the bit set. 
-// The head of the list is atomically updated to the last node found with -// unused space. -// -// Cost time to create an allocation record: amortized O(1), worst case O(num nodes) -// Cost to destroy an allocation recored: O(1) -// -// Singleton allocations are pushed onto a lock-free stack that is destroyed -// after the circular list of allocation records. -struct AllocationRecordPool -{ - enum { BITSET_BLOCKS = 15 }; - - typedef Bitset<BITSET_BLOCKS> bitset_type; - - enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) }; - - struct AllocationNode - { - AllocationNode() - : next() - , bitset() - , buffer() - { - // set the first bit to used - bitset.set(0); - } - - void * get_buffer( int32_t node_index ) - { - return buffer + (node_index-1) * sizeof(AllocationRecord); - } - - // return 0 if no space is available in the node - int32_t get_node_index() - { - int32_t node_index = 0; - do { - node_index = bitset.find_first_unset(); - - // successfully claimed a bit - if ( node_index != bitset.size && bitset.set(node_index) ) - { - return node_index; - } - } while ( node_index != bitset.size ); - return 0; - } - - void clear_node_index( int32_t node_index ) - { - bitset.reset(node_index); - } - - AllocationNode * next; - bitset_type bitset; - char buffer[BUFFER_SIZE]; - }; - - struct SingletonNode - { - void * buffer; - SingletonNode * next; - Impl::singleton_destroy_function_type destroy; - - SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func ) - : buffer(NULL) - , next(NULL) - , destroy(destroy_func) - { - if (size) { - buffer = malloc(size); - create_func(buffer); - } - } - - ~SingletonNode() - { - if (buffer) { - try { - destroy(buffer); - } catch(...) 
{} - free(buffer); - } - } - }; - - AllocationRecordPool() - : head( new AllocationNode() ) - , singleton_head(NULL) - { - // setup ring - head->next = head; - } - - ~AllocationRecordPool() - { - // delete allocation records - { - AllocationNode * start = head; - - AllocationNode * curr = start; - - std::vector< std::string > string_vec; - - do { - AllocationNode * next = curr->next; - - #if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET ) - // print node bitset - for (int i=0; i < bitset_type::blocks; ++i ) { - std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << " "; - } - std::cout << std::endl; - #endif - - // bit zero does not map to an AllocationRecord - for ( int32_t i=1; i < bitset_type::size; ++i ) - { - if (curr->bitset.test(i)) { - AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) ); - - std::ostringstream oss; - alloc_rec->print( oss ); - string_vec.push_back( oss.str() ); - -#if CLEAN_UP_MEMORY_LEAKS -/* Cleaning up memory leaks prevents memory error detection tools - * from reporting the original source of allocation, which can - * impede debugging with such tools. - */ - try { - destroy(alloc_rec); - } - catch(...) 
{} -#endif - } - } - - curr->next = NULL; - - delete curr; - - curr = next; - } while ( curr != start ); - - //if ( !string_vec.empty() ) { - // std::sort( string_vec.begin(), string_vec.end() ); - // - // std::ostringstream oss; - // oss << "Error: Allocation pool destroyed with the following memory leak(s):\n"; - // for (size_t i=0; i< string_vec.size(); ++i) - // { - // oss << " " << string_vec[i] << std::endl; - // } - // - // std::cerr << oss.str() << std::endl; - //} - } - - // delete singletons - { - SingletonNode * curr = singleton_head; - - while (curr) { - SingletonNode * next = curr->next; - delete curr; - curr = next; - } - } - } - - AllocationRecord * create( AllocatorBase * arg_allocator - , void * arg_alloc_ptr - , size_t arg_alloc_size - , const std::string & arg_label - ) - { - AllocationNode * start = volatile_load(&head); - - AllocationNode * curr = start; - - - int32_t node_index = curr->get_node_index(); - - if (node_index == 0) { - curr = volatile_load(&curr->next); - } - - while (node_index == 0 && curr != start) - { - node_index = curr->get_node_index(); - if (node_index == 0) { - curr = volatile_load(&curr->next); - } - } - - // Need to allocate and insert a new node - if (node_index == 0 && curr == start) - { - AllocationNode * new_node = new AllocationNode(); - - node_index = new_node->get_node_index(); - - AllocationNode * next = NULL; - do { - next = volatile_load(&curr->next); - new_node->next = next; - memory_fence(); - } while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) ); - - curr = new_node; - } - - void * buffer = curr->get_buffer(node_index); - - // try to set head to curr - if ( start != curr ) - { - atomic_compare_exchange( & head, start, curr ); - } - - return new (buffer) AllocationRecord( arg_allocator - , arg_alloc_ptr - , arg_alloc_size - , node_index - , arg_label - ); - } - - void destroy( AllocationRecord * alloc_rec ) - { - if (alloc_rec) { - const int32_t node_index = alloc_rec->node_index; - 
AllocationNode * node = get_node( alloc_rec ); - - // deallocate memory - alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size ); - - // call destructor - alloc_rec->~AllocationRecord(); - - // wait for writes to complete - memory_fence(); - - // clear node index - node->clear_node_index( node_index ); - } - } - - void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func ) - { - SingletonNode * node = new SingletonNode( size, create_func, destroy_func ); - SingletonNode * next; - - // insert new node at the head of the list - do { - next = volatile_load(&singleton_head); - node->next = next; - } while ( next != atomic_compare_exchange( &singleton_head, next, node ) ); - - return node->buffer; - } - - void print_memory( std::ostream & out ) const - { - AllocationNode * start = head; - - AllocationNode * curr = start; - - std::vector< std::string > string_vec; - - do { - AllocationNode * next = curr->next; - - // bit zero does not map to an AllocationRecord - for ( int32_t i=1; i < bitset_type::size; ++i ) - { - if (curr->bitset.test(i)) { - AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) ); - - std::ostringstream oss; - alloc_rec->print( oss ); - string_vec.push_back( oss.str() ); - } - } - curr = next; - } while ( curr != start ); - - if ( !string_vec.empty() ) { - std::sort( string_vec.begin(), string_vec.end() ); - - std::ostringstream oss; - oss << "Tracked Memory:" << std::endl; - for (size_t i=0; i< string_vec.size(); ++i) - { - oss << " " << string_vec[i] << std::endl; - } - out << oss.str() << std::endl; - } - else { - out << "No Tracked Memory" << std::endl; - } - } - - // find an AllocationRecord such that - // alloc_ptr <= ptr < alloc_ptr + alloc_size - // otherwise return NULL - AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const - { - AllocationNode * start = head; - - 
AllocationNode * curr = start; - - char const * const char_ptr = reinterpret_cast<const char *>(ptr); - - do { - AllocationNode * next = curr->next; - - // bit zero does not map to an AllocationRecord - for ( int32_t i=1; i < bitset_type::size; ++i ) - { - if (curr->bitset.test(i)) { - AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) ); - - char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr); - - if ( (allocator == alloc_rec->allocator) - && (alloc_ptr <= char_ptr) - && (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) ) - { - return alloc_rec; - } - } - } - curr = next; - } while ( curr != start ); - - return NULL; - } - -private: - - AllocationNode * get_node( AllocationRecord * alloc_rec ) - { - return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index); - } - - AllocationNode * head; - SingletonNode * singleton_head; -}; - -// create the global pool for allocation records -AllocationRecordPool global_alloc_rec_pool; - - - -// convert a uintptr_t to an AllocationRecord pointer -inline -AllocationRecord * to_alloc_rec( uintptr_t alloc_rec ) -{ - return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) ); -} - -} // unnamed namespace - -//----------------------------------------------------------------------------- -// Allocation Tracker methods -//----------------------------------------------------------------------------- - -// Create a reference counted AllocationTracker -void AllocationTracker::initalize( AllocatorBase * arg_allocator - , void * arg_alloc_ptr - , size_t arg_alloc_size - , const std::string & arg_label - ) -{ - if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) { - // create record - AllocationRecord * alloc_rec = global_alloc_rec_pool.create( arg_allocator - , arg_alloc_ptr - , arg_alloc_size - , arg_label - ); - - m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT; - } -} - -void 
AllocationTracker::reallocate( size_t size ) const -{ - AllocationRecord * rec = to_alloc_rec( m_alloc_rec ); - - void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size ); - - if ( NULL != the_alloc_ptr ) - { - *const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr; - *const_cast<uint64_t *>(&rec->alloc_size) = size; - } - else { - Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker"); - } -} - - -void AllocationTracker::increment_ref_count() const -{ - to_alloc_rec( m_alloc_rec )->increment_ref_count(); -} - - -void AllocationTracker::decrement_ref_count() const -{ - AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec ); - uint32_t the_ref_count = alloc_rec->decrement_ref_count(); - if (the_ref_count == 0u) { - try { - global_alloc_rec_pool.destroy( alloc_rec ); - } - catch(...) {} - } -} - -namespace { - -struct NullAllocator { static const char * name() { return "Null Allocator"; } }; - -} - -AllocatorBase * AllocationTracker::allocator() const -{ - if (m_alloc_rec & REF_COUNT_MASK) { - return to_alloc_rec(m_alloc_rec)->allocator; - } - return Allocator<NullAllocator>::singleton(); -} - -void * AllocationTracker::alloc_ptr() const -{ - if (m_alloc_rec & REF_COUNT_MASK) { - return to_alloc_rec(m_alloc_rec)->alloc_ptr; - } - return NULL; -} - -size_t AllocationTracker::alloc_size() const -{ - if (m_alloc_rec & REF_COUNT_MASK) { - return to_alloc_rec(m_alloc_rec)->alloc_size; - } - return 0u; -} - -size_t AllocationTracker::ref_count() const -{ - if (m_alloc_rec & REF_COUNT_MASK) { - return to_alloc_rec(m_alloc_rec)->ref_count; - } - return 0u; -} - -char const * AllocationTracker::label() const -{ - if (m_alloc_rec & REF_COUNT_MASK) { - return to_alloc_rec(m_alloc_rec)->label; - } - return "[Empty Allocation Tracker]"; -} - -void AllocationTracker::print( std::ostream & oss) const -{ - if (m_alloc_rec & REF_COUNT_MASK) { - to_alloc_rec(m_alloc_rec)->print(oss); - } - else { - oss << label(); - } -} - 
-bool AllocationTracker::set_attribute( AllocatorAttributeBase * attr ) const -{ - bool result = false; - if (m_alloc_rec & REF_COUNT_MASK) { - result = to_alloc_rec(m_alloc_rec)->set_attribute(attr); - } - return result; -} - -AllocatorAttributeBase * AllocationTracker::attribute() const -{ - if (m_alloc_rec & REF_COUNT_MASK) { - return to_alloc_rec(m_alloc_rec)->attribute; - } - return NULL; -} - -void AllocationTracker::print_tracked_memory( std::ostream & out ) -{ - global_alloc_rec_pool.print_memory( out ); -} - - -AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator ) -{ - AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator); - - AllocationTracker tracker; - - if ( alloc_rec != NULL ) - { - if ( tracking_enabled() ) { - alloc_rec->increment_ref_count(); - tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT; - } - else { - tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec); - } - } - - return tracker ; -} - - - -//----------------------------------------------------------------------------- -// static AllocationTracker -//----------------------------------------------------------------------------- -#if defined( KOKKOS_USE_DECENTRALIZED_HOST ) -namespace { - - // TODO : Detect compiler support for thread local variables - #if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) - bool g_thread_local_tracking_enabled = true; - #pragma omp threadprivate(g_thread_local_tracking_enabled) - #elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) - __thread bool g_thread_local_tracking_enabled = true; - #elif defined( KOKKOS_HAVE_OPENMP ) - bool g_thread_local_tracking_enabled = true; - #pragma omp threadprivate(g_thread_local_tracking_enabled) - #elif defined( KOKKOS_HAVE_PTHREAD ) - __thread bool g_thread_local_tracking_enabled = true; - #elif defined( KOKKOS_HAVE_SERIAL ) - bool g_thread_local_tracking_enabled = true; - #endif -} // unnamed namespace - -void 
AllocationTracker::disable_tracking() -{ - g_thread_local_tracking_enabled = false; -} - -void AllocationTracker::enable_tracking() -{ - g_thread_local_tracking_enabled = true; -} - -bool AllocationTracker::tracking_enabled() -{ - return g_thread_local_tracking_enabled; -} -#else -namespace { -enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED }; -volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED; -} - -void AllocationTracker::disable_tracking() -{ - if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) { - Impl::throw_runtime_exception("Error: Tracking already disabled"); - } -} - -void AllocationTracker::enable_tracking() -{ - if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) { - Impl::throw_runtime_exception("Error: Tracking already enabled"); - } -} - -bool AllocationTracker::tracking_enabled() -{ - return g_tracking_enabled == TRACKING_ENABLED; -} -#endif - - -//----------------------------------------------------------------------------- -// create singleton free function -//----------------------------------------------------------------------------- -void * create_singleton( size_t size - , Impl::singleton_create_function_type create_func - , Impl::singleton_destroy_function_type destroy_func ) -{ - return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func ); -} - -}} // namespace Kokkos::Impl - -#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */ - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - diff --git a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp deleted file mode 100644 index 738a9d7908..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp +++ /dev/null @@ -1,574 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_ALLOCATION_TRACKER_HPP -#define KOKKOS_ALLOCATION_TRACKER_HPP - -#include <Kokkos_Macros.hpp> - -#if ! 
KOKKOS_USING_EXP_VIEW - -#include <impl/Kokkos_Traits.hpp> -#include <impl/Kokkos_Error.hpp> - -#include <stdint.h> -#include <cstdlib> -#include <string> -#include <iosfwd> - -namespace Kokkos { namespace Impl { - -//----------------------------------------------------------------------------- -// Create Singleton objects -//----------------------------------------------------------------------------- - -typedef void * (*singleton_create_function_type)(void * buffer); -typedef void (*singleton_destroy_function_type)(void *); - -void * create_singleton( size_t size - , singleton_create_function_type create_func - , singleton_destroy_function_type destroy_func - ); - - - -/// class Singleton -/// -/// Default construct a singleton type. This method is used to circumvent -/// order of construction issues. Singleton objects are destroyed after all -/// other allocations in the reverse order of their creation. -template <typename Type> -class Singleton -{ -public: - /// Get a pointer to the Singleton. Default construct the singleton if it does not already exist - static Type * get() - { - static Type * singleton = NULL; - if (singleton == NULL) { - Impl::singleton_create_function_type create_func = &create; - Impl::singleton_destroy_function_type destroy_func = &destroy; - singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) ); - } - return singleton; - } - -private: - - /// Call the Type constructor - static void destroy(void * ptr) - { - reinterpret_cast<Type*>(ptr)->~Type(); - } - - /// placement new the Type in buffer - static void * create(void * buffer) - { - return new (buffer) Type(); - } -}; - - -//----------------------------------------------------------------------------- -// AllocatorBase -//----------------------------------------------------------------------------- - -/// class AllocatorBase -/// -/// Abstract base class for all Allocators. 
-/// Allocators should be singleton objects, use Singleton<Allocator>::get to create -/// to avoid order of destruction issues -class AllocatorBase -{ -public: - /// name of the allocator - /// used to report memory leaks - virtual const char * name() const = 0; - - /// Allocate a buffer of size number of bytes - virtual void* allocate(size_t size) const = 0; - - /// Deallocate a buffer with size number of bytes - /// The pointer must have been allocated with a call to corresponding allocate - virtual void deallocate(void * ptr, size_t size) const = 0; - - /// Changes the size of the memory block pointed to by ptr. - /// Ptr must have been allocated with the corresponding allocate call - /// The function may move the memory block to a new location - /// (whose address is returned by the function). - /// - /// The content of the memory block is preserved up to the lesser of the new and - /// old sizes, even if the block is moved to a new location. If the new size is larger, - /// the value of the newly allocated portion is indeterminate. - /// - /// In case that ptr is a null pointer, the function behaves like allocate, assigning a - /// new block of size bytes and returning a pointer to its beginning. 
- virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0; - - /// can a texture object be bound to the allocated memory - virtual bool support_texture_binding() const = 0; - - /// virtual destructor - virtual ~AllocatorBase() {} -}; - -/// class AllocatorAttributeBase -class AllocatorAttributeBase -{ -public: - virtual ~AllocatorAttributeBase() {} -}; - -//----------------------------------------------------------------------------- -// Allocator< StaticAllocator > : public AllocatorBase -//----------------------------------------------------------------------------- - -// HasStaticName -template<typename T> -class HasStaticName -{ - typedef const char * (*static_method)(); - template<typename U, static_method> struct SFINAE {}; - template<typename U> static char Test(SFINAE<U, &U::name>*); - template<typename U> static int Test(...); -public: - enum { value = sizeof(Test<T>(0)) == sizeof(char) }; -}; - - -template <typename T> -inline -typename enable_if<HasStaticName<T>::value, const char *>::type -allocator_name() -{ - return T::name(); -} - -template <typename T> -inline -typename enable_if<!HasStaticName<T>::value, const char *>::type -allocator_name() -{ - return "Unnamed Allocator"; -} - - -// HasStaticAllocate -template<typename T> -class HasStaticAllocate -{ - typedef void * (*static_method)(size_t); - template<typename U, static_method> struct SFINAE {}; - template<typename U> static char Test(SFINAE<U, &U::allocate>*); - template<typename U> static int Test(...); -public: - enum { value = sizeof(Test<T>(0)) == sizeof(char) }; -}; - -template <typename T> -inline -typename enable_if<HasStaticAllocate<T>::value, void *>::type -allocator_allocate(size_t size) -{ - return T::allocate(size); -} - -template <typename T> -inline -typename enable_if<!HasStaticAllocate<T>::value, void *>::type -allocator_allocate(size_t) -{ - throw_runtime_exception( std::string("Error: ") - + std::string(allocator_name<T>()) - + std::string(" 
cannot allocate memory!") ); - return NULL; -} - -// HasStaticDeallocate -template<typename T> -class HasStaticDeallocate -{ - typedef void (*static_method)(void *, size_t); - template<typename U, static_method> struct SFINAE {}; - template<typename U> static char Test(SFINAE<U, &U::deallocate>*); - template<typename U> static int Test(...); -public: - enum { value = sizeof(Test<T>(0)) == sizeof(char) }; -}; - -template <typename T> -inline -typename enable_if<HasStaticDeallocate<T>::value, void>::type -allocator_deallocate(void * ptr, size_t size) -{ - T::deallocate(ptr,size); -} - -template <typename T> -inline -typename enable_if<!HasStaticDeallocate<T>::value, void>::type -allocator_deallocate(void *, size_t) -{ - throw_runtime_exception( std::string("Error: ") - + std::string(allocator_name<T>()) - + std::string(" cannot deallocate memory!") ); -} - -// HasStaticReallocate -template<typename T> -class HasStaticReallocate -{ - typedef void * (*static_method)(void *, size_t, size_t); - template<typename U, static_method> struct SFINAE {}; - template<typename U> static char Test(SFINAE<U, &U::reallocate>*); - template<typename U> static int Test(...); -public: - enum { value = sizeof(Test<T>(0)) == sizeof(char) }; -}; - -template <typename T> -inline -typename enable_if<HasStaticReallocate<T>::value, void *>::type -allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size) -{ - return T::reallocate(old_ptr, old_size, new_size); -} - -template <typename T> -inline -typename enable_if<!HasStaticReallocate<T>::value, void *>::type -allocator_reallocate(void *, size_t, size_t) -{ - throw_runtime_exception( std::string("Error: ") - + std::string(allocator_name<T>()) - + std::string(" cannot reallocate memory!") ); - return NULL; -} - -// HasStaticReallocate -template<typename T> -class HasStaticSupportTextureBinding -{ - typedef bool (*static_method)(); - template<typename U, static_method> struct SFINAE {}; - template<typename U> static char 
Test(SFINAE<U, &U::support_texture_binding>*); - template<typename U> static int Test(...); -public: - enum { value = sizeof(Test<T>(0)) == sizeof(char) }; -}; - -template <typename T> -inline -typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type -allocator_support_texture_binding() -{ - return T::support_texture_binding(); -} - -template <typename T> -inline -typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type -allocator_support_texture_binding() -{ - return false; -} - -template <typename T> -class Allocator : public AllocatorBase -{ -public: - virtual const char * name() const - { - return allocator_name<T>(); - } - - virtual void* allocate(size_t size) const - { - return allocator_allocate<T>(size); - } - - virtual void deallocate(void * ptr, size_t size) const - { - allocator_deallocate<T>(ptr,size); - } - - virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const - { - return allocator_reallocate<T>(old_ptr, old_size, new_size); - } - - virtual bool support_texture_binding() const - { - return allocator_support_texture_binding<T>(); - } - - static AllocatorBase * singleton() - { - return Singleton< Allocator<T> >::get(); - } -}; - -//----------------------------------------------------------------------------- -// AllocationTracker -//----------------------------------------------------------------------------- - -// forward declaration for friend classes -struct MallocHelper; - -/// class AllocationTracker -/// Will call deallocate from the AllocatorBase when the reference count reaches 0. -/// Reference counting is disabled when the host is in parallel. 
-class AllocationTracker -{ - // use the least significant bit of the AllocationRecord pointer to indicate if the - // AllocationTracker should reference count - enum { - REF_COUNT_BIT = static_cast<uintptr_t>(1) - , REF_COUNT_MASK = ~static_cast<uintptr_t>(1) - }; - -public: - - /// Find an AllocationTracker such that - /// alloc_ptr <= ptr < alloc_ptr + alloc_size - /// O(n) where n is the number of tracked allocations. - template <typename StaticAllocator> - static AllocationTracker find( void const * ptr ) - { - return find( ptr, Allocator<StaticAllocator>::singleton() ); - } - - - /// Pretty print all the currently tracked memory - static void print_tracked_memory( std::ostream & out ); - - /// Default constructor - KOKKOS_INLINE_FUNCTION - AllocationTracker() - : m_alloc_rec(0) - {} - - /// Create a AllocationTracker - /// - /// Start reference counting the alloc_ptr. - /// When the reference count reachs 0 the allocator deallocate method - /// will be call with the given size. The alloc_ptr should have been - /// allocated with the allocator's allocate method. - /// - /// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0 - /// do nothing - template <typename StaticAllocator> - AllocationTracker( StaticAllocator const & - , void * arg_alloc_ptr - , size_t arg_alloc_size - , const std::string & arg_label = std::string("") ) - : m_alloc_rec(0) - { - AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton(); - initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label); - } - - /// Create a AllocationTracker - /// - /// Start reference counting the alloc_ptr. - /// When the reference count reachs 0 the allocator deallocate method - /// will be call with the given size. The alloc_ptr should have been - /// allocated with the allocator's allocate method. 
- /// - /// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0 - /// do nothing - template <typename StaticAllocator> - AllocationTracker( StaticAllocator const & - , size_t arg_alloc_size - , const std::string & arg_label = std::string("") - ) - : m_alloc_rec(0) - { - AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton(); - void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size ); - - initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label); - } - - /// Copy an AllocatorTracker - KOKKOS_INLINE_FUNCTION - AllocationTracker( const AllocationTracker & rhs ) - : m_alloc_rec( rhs.m_alloc_rec) - { -#if !defined( __CUDA_ARCH__ ) - if ( rhs.ref_counting() && tracking_enabled() ) { - increment_ref_count(); - } - else { - m_alloc_rec = m_alloc_rec & REF_COUNT_MASK; - } -#else - m_alloc_rec = m_alloc_rec & REF_COUNT_MASK; -#endif - } - - /// Copy an AllocatorTracker - /// Decrement the reference count of the current tracker if necessary - KOKKOS_INLINE_FUNCTION - AllocationTracker & operator=( const AllocationTracker & rhs ) - { - if (this != &rhs) { -#if !defined( __CUDA_ARCH__ ) - if ( ref_counting() ) { - decrement_ref_count(); - } - - m_alloc_rec = rhs.m_alloc_rec; - - if ( rhs.ref_counting() && tracking_enabled() ) { - increment_ref_count(); - } - else { - m_alloc_rec = m_alloc_rec & REF_COUNT_MASK; - } -#else - m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK; -#endif - } - - return * this; - } - - /// Destructor - /// Decrement the reference count if necessary - KOKKOS_INLINE_FUNCTION - ~AllocationTracker() - { -#if !defined( __CUDA_ARCH__ ) - if ( ref_counting() ) { - decrement_ref_count(); - } -#endif - } - - /// Is the tracker valid? 
- KOKKOS_INLINE_FUNCTION - bool is_valid() const - { - return (m_alloc_rec & REF_COUNT_MASK); - } - - - - /// clear the tracker - KOKKOS_INLINE_FUNCTION - void clear() - { -#if !defined( __CUDA_ARCH__ ) - if ( ref_counting() ) { - decrement_ref_count(); - } -#endif - m_alloc_rec = 0; - } - - /// is this tracker currently counting allocations? - KOKKOS_INLINE_FUNCTION - bool ref_counting() const - { - return (m_alloc_rec & REF_COUNT_BIT); - } - - AllocatorBase * allocator() const; - - /// pointer to the allocated memory - void * alloc_ptr() const; - - /// size in bytes of the allocated memory - size_t alloc_size() const; - - /// the current reference count - size_t ref_count() const; - - /// the label given to the allocation - char const * label() const; - - /// pretty print all the tracker's information to the std::ostream - void print( std::ostream & oss) const; - - - /// set an attribute ptr on the allocation record - /// the arg_attribute pointer will be deleted when the record is destroyed - /// the attribute ptr can only be set once - bool set_attribute( AllocatorAttributeBase * arg_attribute) const; - - /// get the attribute ptr from the allocation record - AllocatorAttributeBase * attribute() const; - - - /// reallocate the memory tracked by this allocation - /// NOT thread-safe - void reallocate( size_t size ) const; - - static void disable_tracking(); - static void enable_tracking(); - static bool tracking_enabled(); - -private: - - static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator ); - - void initalize( AllocatorBase * arg_allocator - , void * arg_alloc_ptr - , size_t arg_alloc_size - , std::string const & label ); - - void increment_ref_count() const; - void decrement_ref_count() const; - - friend struct Impl::MallocHelper; - - uintptr_t m_alloc_rec; -}; - -}} // namespace Kokkos::Impl - -#endif /* #if ! 
KOKKOS_USING_EXP_VIEW */ - -#endif //KOKKOS_ALLOCATION_TRACKER_HPP - diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp new file mode 100644 index 0000000000..0246a7b9af --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp @@ -0,0 +1,197 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_ANALYZE_POLICY_HPP +#define KOKKOS_IMPL_ANALYZE_POLICY_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Concepts.hpp> +#include <impl/Kokkos_Tags.hpp> + +namespace Kokkos { namespace Impl { + +template < typename ExecutionSpace = void + , typename Schedule = void + , typename WorkTag = void + , typename IndexType = void + , typename IterationPattern = void + > +struct PolicyTraitsBase +{ + using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>; + + using execution_space = ExecutionSpace; + using schedule_type = Schedule; + using work_tag = WorkTag; + using index_type = IndexType; + using iteration_pattern = IterationPattern; +}; + + +template <typename PolicyBase, typename ExecutionSpace> +struct SetExecutionSpace +{ + static_assert( is_void<typename PolicyBase::execution_space>::value + , "Kokkos Error: More than one execution space given" ); + using type = PolicyTraitsBase< ExecutionSpace + , typename PolicyBase::schedule_type + , typename PolicyBase::work_tag + , typename PolicyBase::index_type + , typename PolicyBase::iteration_pattern + >; +}; + +template <typename PolicyBase, typename Schedule> +struct SetSchedule +{ + static_assert( is_void<typename PolicyBase::schedule_type>::value + , "Kokkos 
Error: More than one schedule type given" ); + using type = PolicyTraitsBase< typename PolicyBase::execution_space + , Schedule + , typename PolicyBase::work_tag + , typename PolicyBase::index_type + , typename PolicyBase::iteration_pattern + >; +}; + +template <typename PolicyBase, typename WorkTag> +struct SetWorkTag +{ + static_assert( is_void<typename PolicyBase::work_tag>::value + , "Kokkos Error: More than one work tag given" ); + using type = PolicyTraitsBase< typename PolicyBase::execution_space + , typename PolicyBase::schedule_type + , WorkTag + , typename PolicyBase::index_type + , typename PolicyBase::iteration_pattern + >; +}; + +template <typename PolicyBase, typename IndexType> +struct SetIndexType +{ + static_assert( is_void<typename PolicyBase::index_type>::value + , "Kokkos Error: More than one index type given" ); + using type = PolicyTraitsBase< typename PolicyBase::execution_space + , typename PolicyBase::schedule_type + , typename PolicyBase::work_tag + , IndexType + , typename PolicyBase::iteration_pattern + >; +}; + + +template <typename PolicyBase, typename IterationPattern> +struct SetIterationPattern +{ + static_assert( is_void<typename PolicyBase::iteration_pattern>::value + , "Kokkos Error: More than one iteration_pattern given" ); + using type = PolicyTraitsBase< typename PolicyBase::execution_space + , typename PolicyBase::schedule_type + , typename PolicyBase::work_tag + , typename PolicyBase::index_type + , IterationPattern + >; +}; + + +template <typename Base, typename... Traits> +struct AnalyzePolicy; + +template <typename Base, typename T, typename... 
Traits> +struct AnalyzePolicy<Base, T, Traits...> : public + AnalyzePolicy< + typename std::conditional< is_execution_space<T>::value , SetExecutionSpace<Base,T> + , typename std::conditional< is_schedule_type<T>::value , SetSchedule<Base,T> + , typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T> + , typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> > + , typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T> + , SetWorkTag<Base,T> + >::type >::type >::type >::type>::type::type + , Traits... + > +{}; + +template <typename Base> +struct AnalyzePolicy<Base> +{ + using execution_space = typename std::conditional< is_void< typename Base::execution_space >::value + , DefaultExecutionSpace + , typename Base::execution_space + >::type; + + using schedule_type = typename std::conditional< is_void< typename Base::schedule_type >::value + , Schedule< Static > + , typename Base::schedule_type + >::type; + + using work_tag = typename Base::work_tag; + + using index_type = typename std::conditional< is_void< typename Base::index_type >::value + , IndexType< typename execution_space::size_type > + , typename Base::index_type + >::type + ::type // nasty hack to make index_type into an integral_type + ; // instead of the wrapped IndexType<T> for backwards compatibility + + using iteration_pattern = typename std::conditional< is_void< typename Base::iteration_pattern >::value + , void // TODO set default iteration pattern + , typename Base::iteration_pattern + >::type; + using type = PolicyTraitsBase< execution_space + , schedule_type + , work_tag + , index_type + , iteration_pattern + >; +}; + +template <typename... Traits> +struct PolicyTraits + : public AnalyzePolicy< PolicyTraitsBase<>, Traits... 
>::type +{}; + +}} // namespace Kokkos::Impl + + +#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp index 8a27ce6f22..fd7ea845e7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -218,7 +218,17 @@ T atomic_compare_exchange( volatile T * const dest , const T compare , while( !Impl::lock_address_host_space( (void*) dest ) ); T return_val = *dest; if( return_val == compare ) { - const T tmp = *dest = val; + // Don't use the following line of code here: + // + //const T tmp = *dest = val; + // + // Instead, put each assignment in its own statement. This is + // because the overload of T::operator= for volatile *this should + // return void, not volatile T&. 
See Kokkos #177: + // + // https://github.com/kokkos/kokkos/issues/177 + *dest = val; + const T tmp = *dest; #ifndef KOKKOS_COMPILER_CLANG (void) tmp; #endif @@ -239,7 +249,7 @@ T atomic_compare_exchange( volatile T * const dest, const T compare, const T val { retval = dest[0]; if ( retval == compare ) - dest[0] = val; + dest[0] = val; } return retval; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp index 8990604674..e8cac4ba3b 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -228,7 +228,17 @@ T atomic_exchange( volatile T * const dest , { while( !Impl::lock_address_host_space( (void*) dest ) ); T return_val = *dest; - const T tmp = *dest = val; + // Don't use the following line of code here: + // + //const T tmp = *dest = val; + // + // Instead, put each assignment in its own statement. This is + // because the overload of T::operator= for volatile *this should + // return void, not volatile T&. 
See Kokkos #177: + // + // https://github.com/kokkos/kokkos/issues/177 + *dest = val; + const T tmp = *dest; #ifndef KOKKOS_COMPILER_CLANG (void) tmp; #endif @@ -305,7 +315,9 @@ void atomic_assign( volatile T * const dest , // member. The volatile return value implicitly defines a // dereference that some compilers (gcc 4.7.2) warn is being ignored. // Suppress warning by casting return to void. - (void)( *dest = val ); + //(void)( *dest = val ); + *dest = val; + Impl::unlock_address_host_space( (void*) dest ); } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp index 239bbf7cbb..62dfcdd2f8 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -93,7 +93,7 @@ T atomic_fetch_add( volatile T * const dest , assume.i = oldval.i ; newval.t = assume.t + val ; oldval.i = atomicCAS( (int*)dest , assume.i , newval.i ); - } while ( assumed.i != oldval.i ); + } while ( assume.i != oldval.i ); return oldval.t ; } @@ -156,9 +156,26 @@ T atomic_fetch_add( volatile T * const dest , #elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) +#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 ) +KOKKOS_INLINE_FUNCTION +int atomic_fetch_add( volatile int * dest , const int val ) +{ + int original = val; + + __asm__ __volatile__( + "lock xadd %1, %0" + : "+m" (*dest), "+r" (original) + : "m" (*dest), "r" (original) + : "memory" + ); + + return original; +} +#else KOKKOS_INLINE_FUNCTION int atomic_fetch_add( volatile int * const dest , const int val ) -{ return __sync_fetch_and_add(dest,val); } +{ return __sync_fetch_and_add(dest, val); } +#endif KOKKOS_INLINE_FUNCTION long int atomic_fetch_add( volatile long int * const dest , const long int val ) @@ -276,7 +293,17 @@ T atomic_fetch_add( volatile T * const dest , { while( !Impl::lock_address_host_space( (void*) dest ) ); T return_val = *dest; - const T tmp = *dest = return_val + val; + // Don't use the following line of code here: + // + //const T tmp = *dest = return_val + val; + // + // Instead, put each assignment in its own statement. This is + // because the overload of T::operator= for volatile *this should + // return void, not volatile T&. 
See Kokkos #177: + // + // https://github.com/kokkos/kokkos/issues/177 + *dest = return_val + val; + const T tmp = *dest; (void) tmp; Impl::unlock_address_host_space( (void*) dest ); return return_val; diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp index 647b3ad4e1..a3a57aa81c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp @@ -73,7 +73,7 @@ T atomic_fetch_sub( volatile T * const dest , assume.i = oldval.i ; newval.t = assume.t - val ; oldval.i = atomicCAS( (int*)dest , assume.i , newval.i ); - } while ( assumed.i != oldval.i ); + } while ( assume.i != oldval.i ); return oldval.t ; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp index 76f3ccac73..343e9bf4c4 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp @@ -48,6 +48,22 @@ namespace Kokkos { namespace Impl { +template<class Scalar1, class Scalar2> +struct MaxOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return (val1 > val2 ? val1 : val2); + } +}; + +template<class Scalar1, class Scalar2> +struct MinOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return (val1 < val2 ? 
val1 : val2); + } +}; + template<class Scalar1, class Scalar2> struct AddOper { KOKKOS_FORCEINLINE_FUNCTION @@ -276,6 +292,18 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest , namespace Kokkos { // Fetch_Oper atomics: return value before operation +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_max(volatile T * const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::MaxOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_min(volatile T * const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::MinOper<T,const T>(),dest,val); +} + template < typename T > KOKKOS_INLINE_FUNCTION T atomic_fetch_mul(volatile T * const dest, const T val) { @@ -326,6 +354,18 @@ T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) { // Oper Fetch atomics: return value after operation +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_max_fetch(volatile T * const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::MaxOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_min_fetch(volatile T * const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::MinOper<T,const T>(),dest,val); +} + template < typename T > KOKKOS_INLINE_FUNCTION T atomic_mul_fetch(volatile T * const dest, const T val) { diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp index 4a2a408273..6e48faa694 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp @@ -425,42 +425,6 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> { typedef int64_t type; }; -#if ! KOKKOS_USING_EXP_VIEW - -class AllocationTracker; - -// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics. -template<class ViewTraits> -class ViewDataHandle< - ViewTraits , - typename enable_if< - ( ! 
is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) && - ( ViewTraits::memory_traits::Atomic ) - >::type > -{ -private: -// typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) || -// (sizeof(typename ViewTraits::const_value_type)==8), -// int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type -// atomic_view_possible; - typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type; - typedef ViewDataHandle self_type; - -public: - enum { ReturnTypeIsReference = false }; - - typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type; - typedef Impl::AtomicDataElement<ViewTraits> return_type; - - KOKKOS_INLINE_FUNCTION - static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ ) - { - return handle_type(arg_data_ptr); - } -}; - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - }} // namespace Kokkos::Impl #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp b/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp deleted file mode 100644 index 7cf233c689..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp +++ /dev/null @@ -1,287 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include <Kokkos_HostSpace.hpp> - -#if ! 
KOKKOS_USING_EXP_VIEW - -#include <impl/Kokkos_BasicAllocators.hpp> -#include <impl/Kokkos_Error.hpp> - - -#include <stdint.h> // uintptr_t -#include <cstdlib> // for malloc, realloc, and free -#include <cstring> // for memcpy - -#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE) -#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc -#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES -#endif - -#include <sstream> - -namespace Kokkos { namespace Impl { - -/*--------------------------------------------------------------------------*/ - -void* MallocAllocator::allocate( size_t size ) -{ - void * ptr = NULL; - if (size) { - ptr = malloc(size); - - if (!ptr) - { - std::ostringstream msg ; - msg << name() << ": allocate(" << size << ") FAILED"; - throw_runtime_exception( msg.str() ); - } - } - return ptr; -} - -void MallocAllocator::deallocate( void * ptr, size_t /*size*/ ) -{ - if (ptr) { - free(ptr); - } -} - -void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size) -{ - void * ptr = realloc(old_ptr, new_size); - - if (new_size > 0u && ptr == NULL) { - throw_runtime_exception("Error: Malloc Allocator could not reallocate memory"); - } - return ptr; -} - -/*--------------------------------------------------------------------------*/ - -namespace { - -void * raw_aligned_allocate( size_t size, size_t alignment ) -{ - void * ptr = NULL; - if ( size ) { -#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA ) - ptr = _mm_malloc( size , alignment ); - -#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE) - - posix_memalign( & ptr, alignment , size ); - -#else - // Over-allocate to and round up to guarantee proper alignment. 
- size_t size_padded = size + alignment + sizeof(void *); - void * alloc_ptr = malloc( size_padded ); - - if (alloc_ptr) { - uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr); - // offset enough to record the alloc_ptr - address += sizeof(void *); - uintptr_t rem = address % alignment; - uintptr_t offset = rem ? (alignment - rem) : 0u; - address += offset; - ptr = reinterpret_cast<void *>(address); - // record the alloc'd pointer - address -= sizeof(void *); - *reinterpret_cast<void **>(address) = alloc_ptr; - } -#endif - } - return ptr; -} - -void raw_aligned_deallocate( void * ptr, size_t /*size*/ ) -{ - if ( ptr ) { -#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA ) - _mm_free( ptr ); - -#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE) - free( ptr ); -#else - // get the alloc'd pointer - void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1); - free( alloc_ptr ); -#endif - } - -} - -} - -void* AlignedAllocator::allocate( size_t size ) -{ - void * ptr = 0 ; - - if ( size ) { - ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT); - - if (!ptr) - { - std::ostringstream msg ; - msg << name() << ": allocate(" << size << ") FAILED"; - throw_runtime_exception( msg.str() ); - } - } - return ptr; -} - -void AlignedAllocator::deallocate( void * ptr, size_t size ) -{ - raw_aligned_deallocate( ptr, size); -} - -void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size) -{ - void * ptr = old_ptr;; - - if (old_size < new_size) { - ptr = allocate( new_size ); - - memcpy(ptr, old_ptr, old_size ); - - deallocate( old_ptr, old_size ); - } - - return ptr; -} - -/*--------------------------------------------------------------------------*/ - -// mmap flags for private anonymous memory allocation -#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE ) - #define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS) -#elif defined( MAP_ANON) && defined( MAP_PRIVATE ) - #define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON) -#else - #define NO_MMAP -#endif - 
-// huge page tables -#if !defined( NO_MMAP ) - #if defined( MAP_HUGETLB ) - #define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB ) - #elif defined( MMAP_FLAGS ) - #define MMAP_FLAGS_HUGE MMAP_FLAGS - #endif - // threshold to use huge pages - #define MMAP_USE_HUGE_PAGES (1u << 27) -#endif - -// read write access to private memory -#if !defined( NO_MMAP ) - #define MMAP_PROTECTION (PROT_READ | PROT_WRITE) -#endif - - -void* PageAlignedAllocator::allocate( size_t size ) -{ - void *ptr = NULL; - if (size) { -#if !defined NO_MMAP - if ( size < MMAP_USE_HUGE_PAGES ) { - ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/); - } else { - ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/); - } - if (ptr == MAP_FAILED) { - ptr = NULL; - } -#else - static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE ) - - ptr = raw_aligned_allocate( size, page_size); -#endif - if (!ptr) - { - std::ostringstream msg ; - msg << name() << ": allocate(" << size << ") FAILED"; - throw_runtime_exception( msg.str() ); - } - } - return ptr; -} - -void PageAlignedAllocator::deallocate( void * ptr, size_t size ) -{ -#if !defined( NO_MMAP ) - munmap(ptr, size); -#else - raw_aligned_deallocate(ptr, size); -#endif -} - -void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size) -{ - void * ptr = NULL; -#if defined( NO_MMAP ) || defined( __APPLE__ ) || defined( __CYGWIN__ ) - - if (old_size != new_size) { - ptr = allocate( new_size ); - - memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) ); - - deallocate( old_ptr, old_size ); - } - else { - ptr = old_ptr; - } -#else - ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE ); - - if (ptr == MAP_FAILED) { - throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory"); - } -#endif - - return ptr; -} - -}} // namespace Kokkos::Impl - -#endif /* #if ! 
KOKKOS_USING_EXP_VIEW */ - diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp new file mode 100644 index 0000000000..0ffbc0548a --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp @@ -0,0 +1,122 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_BITOPS_HPP +#define KOKKOS_BITOPS_HPP + +#include <Kokkos_Macros.hpp> +#include <stdint.h> +#include <climits> + +namespace Kokkos { +namespace Impl { + +KOKKOS_FORCEINLINE_FUNCTION +int bit_scan_forward( unsigned i ) +{ +#if defined( __CUDA_ARCH__ ) + return __ffs(i) - 1; +#elif defined( __GNUC__ ) || defined( __GNUG__ ) + return __builtin_ffs(i) - 1; +#elif defined( __INTEL_COMPILER ) + return _bit_scan_forward(i); +#else + + unsigned t = 1u; + int r = 0; + while ( i && ( ( i & t ) == 0 ) ) + { + t = t << 1; + ++r; + } + return r; +#endif +} + +KOKKOS_FORCEINLINE_FUNCTION +int bit_scan_reverse( unsigned i ) +{ + enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) }; +#if defined( __CUDA_ARCH__ ) + return shift - __clz(i); +#elif defined( __GNUC__ ) || defined( __GNUG__ ) + return shift - __builtin_clz(i); +#elif defined( __INTEL_COMPILER ) + return _bit_scan_reverse(i); +#else + unsigned t = 1u << shift; + int r = 0; + while ( i && ( ( i & t ) == 0 ) ) + { + t = t >> 1; + ++r; + } + return shift - r; +#endif +} + +/// Count the number of bits set.
+KOKKOS_FORCEINLINE_FUNCTION +int bit_count( unsigned i ) +{ +#if defined( __CUDA_ARCH__ ) + return __popc(i); +#elif defined( __GNUC__ ) || defined( __GNUG__ ) + return __builtin_popcount(i); +#elif defined ( __INTEL_COMPILER ) + return _popcnt32(i); +#else + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive + i = i - ( ( i >> 1 ) & ~0u / 3u ); // temp + i = ( i & ~0u / 15u * 3u ) + ( ( i >> 2 ) & ~0u / 15u * 3u ); // temp + i = ( i + ( i >> 4 ) ) & ~0u / 255u * 15u; // temp + + // count + return (int)( ( i * ( ~0u / 255u ) ) >> ( sizeof(unsigned) - 1 ) * CHAR_BIT ); +#endif +} + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_BITOPS_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index 94db15d96f..567a214140 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -148,7 +148,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0); #endif #if (KOKKOS_ENABLE_PROFILING) - Kokkos::Experimental::initialize(); + Kokkos::Profiling::initialize(); #endif } @@ -190,7 +190,7 @@ void finalize_internal( const bool all_spaces = false ) #endif #if (KOKKOS_ENABLE_PROFILING) - Kokkos::Experimental::finalize(); + Kokkos::Profiling::finalize(); #endif } diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp index 43a1b2afbd..78b6794491 100644 --- a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp @@ -75,6 +75,7 @@ struct FunctorValueTraits typedef void value_type ; typedef void pointer_type ; typedef void reference_type ; + typedef void functor_type ; enum { StaticValueSize = 0 }; @@ -88,7 +89,10 @@ struct FunctorValueTraits template<class ArgTag> struct FunctorValueTraits<void, ArgTag,false> { - typedef void reference_type; + typedef void value_type ; + typedef void pointer_type ; + typedef void reference_type ; + typedef void functor_type 
; }; /** \brief FunctorType::value_type is explicitly declared so use it. @@ -106,6 +110,7 @@ template< class FunctorType , class ArgTag > struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType::value_type */ > { typedef typename Impl::remove_extent< typename FunctorType::value_type >::type value_type ; + typedef FunctorType functor_type; static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) , "Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" ); @@ -342,6 +347,7 @@ public: typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType >::type value_type ; typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType * >::type pointer_type ; typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType & >::type reference_type ; + typedef FunctorType functor_type; static_assert( IS_VOID || IS_REJECT || 0 == ( sizeof(ValueType) % sizeof(int) ) , "Reduction functor's value_type deduced from functor::operator() requires: 0 == sizeof(value_type) % sizeof(int)" ); @@ -568,24 +574,56 @@ struct FunctorValueJoin ; template< class FunctorType , class ArgTag , class T , class Enable > struct FunctorValueJoin< FunctorType , ArgTag , T & , Enable > { + KOKKOS_FORCEINLINE_FUNCTION + FunctorValueJoin(const FunctorType& ){} + KOKKOS_FORCEINLINE_FUNCTION static void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) { *((volatile T*)lhs) += *((const volatile T*)rhs); } + KOKKOS_FORCEINLINE_FUNCTION + void operator()( volatile T& lhs , const volatile T& rhs ) const + { + lhs += rhs; + } + KOKKOS_FORCEINLINE_FUNCTION + void operator() ( T& lhs , const T& rhs ) const + { + lhs += rhs; + } }; /* No 'join' function provided, array of values */ template< class FunctorType , class ArgTag , class T , class Enable > struct FunctorValueJoin< FunctorType , ArgTag , T * , Enable > { + const FunctorType& f; + + KOKKOS_FORCEINLINE_FUNCTION + FunctorValueJoin(const 
FunctorType& f_):f(f_){} + KOKKOS_FORCEINLINE_FUNCTION static - void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs ) { - const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f); + const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f_); for ( int i = 0 ; i < n ; ++i ) { ((volatile T*)lhs)[i] += ((const volatile T*)rhs)[i]; } } + KOKKOS_FORCEINLINE_FUNCTION + void operator()( volatile T* const lhs , const volatile T* const rhs ) const + { + const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f); + + for ( int i = 0 ; i < n ; ++i ) { lhs[i] += rhs[i]; } + } + KOKKOS_FORCEINLINE_FUNCTION + void operator() ( T* lhs , const T* rhs ) const + { + const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f); + + for ( int i = 0 ; i < n ; ++i ) { lhs[i] += rhs[i]; } + } }; /* 'join' function provided, single value */ @@ -599,10 +637,25 @@ struct FunctorValueJoin , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) ) > { + const FunctorType& f; + + KOKKOS_FORCEINLINE_FUNCTION + FunctorValueJoin(const FunctorType& f_):f(f_){} + KOKKOS_FORCEINLINE_FUNCTION static - void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs ) + { + f_.join( ArgTag() , *((volatile T *)lhs) , *((const volatile T *)rhs) ); + } + KOKKOS_FORCEINLINE_FUNCTION + void operator()( volatile T& lhs , const volatile T& rhs ) const + { + f.join( ArgTag() , lhs , rhs ); + } + KOKKOS_FORCEINLINE_FUNCTION + void operator() ( T& lhs , const T& rhs ) const { - f.join( ArgTag() , *((volatile T *)lhs) , *((const volatile T *)rhs) ); + f.join( ArgTag(), lhs , rhs ); } }; @@ -617,10 +670,25 @@ struct FunctorValueJoin , decltype( 
FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) ) > { + const FunctorType& f; + + KOKKOS_FORCEINLINE_FUNCTION + FunctorValueJoin(const FunctorType& f_):f(f_){} + KOKKOS_FORCEINLINE_FUNCTION static - void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs ) { - f.join( *((volatile T *)lhs) , *((const volatile T *)rhs) ); + f_.join( *((volatile T *)lhs) , *((const volatile T *)rhs) ); + } + KOKKOS_FORCEINLINE_FUNCTION + void operator()( volatile T& lhs , const volatile T& rhs ) const + { + f.join( lhs , rhs ); + } + KOKKOS_FORCEINLINE_FUNCTION + void operator() ( T& lhs , const T& rhs ) const + { + f.join( lhs , rhs ); } }; @@ -635,10 +703,25 @@ struct FunctorValueJoin , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) ) > { + const FunctorType& f; + + KOKKOS_FORCEINLINE_FUNCTION + FunctorValueJoin(const FunctorType& f_):f(f_){} + KOKKOS_FORCEINLINE_FUNCTION static - void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs ) + { + f_.join( ArgTag() , (volatile T *)lhs , (const volatile T *)rhs ); + } + KOKKOS_FORCEINLINE_FUNCTION + void operator()( volatile T* const lhs , const volatile T* const rhs ) const { - f.join( ArgTag() , (volatile T *)lhs , (const volatile T *)rhs ); + f.join( ArgTag() , lhs , rhs ); + } + KOKKOS_FORCEINLINE_FUNCTION + void operator() ( T* lhs , const T* rhs ) const + { + f.join( ArgTag(), lhs , rhs ); } }; @@ -653,10 +736,25 @@ struct FunctorValueJoin , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) ) > { + const FunctorType& f; + + KOKKOS_FORCEINLINE_FUNCTION + FunctorValueJoin(const FunctorType& f_):f(f_){} + KOKKOS_FORCEINLINE_FUNCTION static - 
void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs ) + { + f_.join( (volatile T *)lhs , (const volatile T *)rhs ); + } + KOKKOS_FORCEINLINE_FUNCTION + void operator() ( volatile T* const lhs , const volatile T* const rhs ) const + { + f.join( lhs , rhs ); + } + KOKKOS_FORCEINLINE_FUNCTION + void operator() ( T* lhs , const T* rhs ) const { - f.join( (volatile T *)lhs , (const volatile T *)rhs ); + f.join( lhs , rhs ); } }; diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp index 20956ce593..11cc120212 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp @@ -56,7 +56,6 @@ #include <algorithm> #include <Kokkos_HBWSpace.hpp> -#include <impl/Kokkos_BasicAllocators.hpp> #include <impl/Kokkos_Error.hpp> #include <Kokkos_Atomic.hpp> #ifdef KOKKOS_HAVE_HBWSPACE @@ -126,23 +125,6 @@ int HBWSpace::in_parallel() /*--------------------------------------------------------------------------*/ -#if ! KOKKOS_USING_EXP_VIEW - -namespace Kokkos { -namespace Experimental { - -Kokkos::Impl::AllocationTracker HBWSpace::allocate_and_track( const std::string & label, const size_t size ) -{ - return Kokkos::Impl::AllocationTracker( allocator(), size, label ); -} - -} // namespace Experimental -} // namespace Kokkos - -#endif /* #if ! 
KOKKOS_USING_EXP_VIEW */ - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { namespace Experimental { diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp index 9dc774cdeb..b52f4591ef 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -93,7 +93,6 @@ #include <cstring> #include <Kokkos_HostSpace.hpp> -#include <impl/Kokkos_BasicAllocators.hpp> #include <impl/Kokkos_Error.hpp> #include <Kokkos_Atomic.hpp> @@ -156,21 +155,6 @@ int HostSpace::in_parallel() /*--------------------------------------------------------------------------*/ -#if ! KOKKOS_USING_EXP_VIEW - -namespace Kokkos { - -Impl::AllocationTracker HostSpace::allocate_and_track( const std::string & label, const size_t size ) -{ - return Impl::AllocationTracker( allocator(), size, label ); -} - -} // namespace Kokkos - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { /* Default allocation mechanism */ diff --git a/lib/kokkos/core/src/impl/Kokkos_MemoryPool_Inline.hpp b/lib/kokkos/core/src/impl/Kokkos_MemoryPool_Inline.hpp deleted file mode 100644 index bb858d8d9e..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_MemoryPool_Inline.hpp +++ /dev/null @@ -1,446 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. 
Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_MEMORYPOOL_CPP -#define KOKKOS_MEMORYPOOL_CPP - -// How should errors be handled? In general, production code should return a -// value indicating failure so the user can decide how the error is handled. -// While experimental, code can abort instead. If KOKKOS_MEMPOOLLIST_PRINTERR -// is defined, the code will abort with an error message. Otherwise, the code -// will return with a value indicating failure when possible, or do nothing -// instead. 
-//#define KOKKOS_MEMPOOLLIST_PRINTERR - -//#define KOKKOS_MEMPOOLLIST_PRINT_INFO - -//---------------------------------------------------------------------------- - -#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) - -/* This '.cpp' is being included by the header file - * to inline these functions for Cuda. - * - * Prefer to implement these functions in a separate - * compilation unit. However, the 'nvcc' linker - * has an internal error when attempting separate compilation - * (--relocatable-device-code=true) - * of Kokkos unit tests. - */ - -#define KOKKOS_MEMPOOLLIST_INLINE inline - -#else - -/* This '.cpp' file is being separately compiled for the Host */ - -#include <Kokkos_MemoryPool.hpp> -#include <Kokkos_Atomic.hpp> - -#define KOKKOS_MEMPOOLLIST_INLINE /* */ - -#endif - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -#if defined(KOKKOS_MEMPOOLLIST_PRINT_INFO) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) -long MemPoolList::m_count = 0; -#endif - -KOKKOS_FUNCTION -KOKKOS_MEMPOOLLIST_INLINE -uint64_t -MemPoolList::acquire_lock( volatile uint64_t * freelist ) const -{ - uint64_t old_head; - bool locked = false; - - while ( !locked ) { - old_head = *freelist; - - if ( old_head != FREELIST_LOCK_HEAD ) { - // In the initial look at the head, the freelist wasn't locked. - // Attempt to lock the head of list. If the list was changed (including - // being locked) between the initial look and now, head will be different - // than old_head. This means the lock can't proceed and has to be - // tried again. 
- uint64_t head = - atomic_compare_exchange( freelist, old_head, uint64_t(FREELIST_LOCK_HEAD) ); - - if ( head == old_head ) locked = true; - } - } - - return old_head; -} - -KOKKOS_FUNCTION -KOKKOS_MEMPOOLLIST_INLINE -void -MemPoolList::release_lock( volatile uint64_t * freelist, uint64_t new_head ) const -{ - // This function is only intended to be called if acquire_lock() has already - // been called to acquire a lock on freelist. Thus, we know that the value - // pointed to by freelist is FREELIST_LOCK_HEAD. -#ifdef KOKKOS_MEMPOOLLIST_PRINTERR - uint64_t head = -#endif - atomic_compare_exchange( freelist, uint64_t(FREELIST_LOCK_HEAD), new_head ); - -#ifdef KOKKOS_MEMPOOLLIST_PRINTERR - if ( head != FREELIST_LOCK_HEAD ) { - // We shouldn't get here, but this check is here for sanity. - printf( "\n** MemoryPool::allocate() UNLOCK_ERROR(0x%llx) **\n", - reinterpret_cast<uint64_t>( freelist ) ); -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - fflush( stdout ); -#endif - Kokkos::abort( "" ); - } -#endif -} - -KOKKOS_FUNCTION -KOKKOS_MEMPOOLLIST_INLINE -void * -MemPoolList::refill_freelist( size_t l_exp ) const -{ - void * p = 0; - volatile uint64_t * l_exp_freelist = m_freelist + l_exp; - - // The l_exp freelist is empty. Grab a lock on the freelist. - uint64_t l_exp_old_head = acquire_lock( l_exp_freelist ); - - uint64_t l_exp_old_head_off = get_head_offset( l_exp_old_head ); - - if ( l_exp_old_head_off != FREELIST_END ) { - // Another thread put some more entries on the freelist between when - // this thread saw it empty and acquired the lock. Just return an entry. - uint64_t l_exp_old_head_tag = get_head_tag( l_exp_old_head ); - uint64_t new_head_tag = increment_tag( l_exp_old_head_tag ); - uint64_t new_head_off = *reinterpret_cast<uint64_t *>( m_data + l_exp_old_head_off ); - uint64_t new_head = create_head( new_head_off, new_head_tag ); - - // Release the lock, replacing the head with the next entry on the list. 
- release_lock( l_exp_freelist, new_head ); - - // Set the chunk to return. - p = m_data + l_exp_old_head_off; - } - else { - // The l_exp freelist is empty. - - size_t l = l_exp + 1; - bool done = false; - - while ( !done ) { - // Find the next freelist that is either locked or not empty. A locked - // freelist will probably have memory available when the lock is - // released. - while ( m_chunk_size[l] > 0 && - get_head_offset( m_freelist[l] ) == FREELIST_END ) ++l; - - if ( m_chunk_size[l] == 0 ) { - // We got to the end of the list of freelists without finding any - // available memory which means the pool is empty. Release the lock - // on the l_exp freelist. - release_lock( l_exp_freelist, l_exp_old_head ); - - // Exit out of the loop. - done = true; - } - else { - volatile uint64_t * l_freelist = m_freelist + l; - - // Grab a lock on the l freelist. - uint64_t l_old_head = acquire_lock( l_freelist ); - uint64_t l_old_head_off = get_head_offset( l_old_head ); - - if ( l_old_head_off != FREELIST_END ) { - // The l freelist has chunks. Grab one to divide. - - // Create a new head for the l_freelist by using the second entry - // in the list and incrementing the current tag. - uint64_t l_old_head_tag = get_head_tag( l_old_head ); - uint64_t new_head_tag = increment_tag( l_old_head_tag ); - uint64_t new_head_off = - *reinterpret_cast<volatile uint64_t *>( m_data + l_old_head_off ); - uint64_t new_head = create_head( new_head_off, new_head_tag ); - - // Release the lock on the l freelist. - release_lock( l_freelist, new_head ); - - // Subdivide the chunk into smaller chunks. The first chunk will - // be returned to satisfy the allocaiton request. The remainder - // of the chunks will be inserted onto the appropriate freelist. - size_t num_chunks = m_chunk_size[l] / m_chunk_size[l_exp]; - - // Link the chunks following the first chunk to form a list. 
- uint64_t lp_head = l_old_head_off + m_chunk_size[l_exp]; - uint64_t lp_tail = l_old_head_off + (num_chunks - 1) * m_chunk_size[l_exp]; - - for ( uint64_t offset = lp_head; offset < lp_tail; - offset += m_chunk_size[l_exp] ) - { - *reinterpret_cast<uint64_t *>( m_data + offset ) = - offset + m_chunk_size[l_exp]; - } - - // Set the tail entry to be the end of the list. - *reinterpret_cast<volatile uint64_t *>( m_data + lp_tail ) = FREELIST_END; - - memory_fence(); - - // Create a new head for the l_exp_freelist. - new_head = create_head( lp_head, get_head_tag( l_exp_old_head ) ); - - // This thread already has the lock on the l_exp freelist, so just - // release the lock placing the divided memory on the list. - release_lock( l_exp_freelist, new_head ); - - // Set the chunk to return. - p = m_data + l_old_head_off; - done = true; - } - else { - // Release the lock on the l freelist. Put the old head back on. - release_lock( l_freelist, l_old_head ); - } - } - } - } - - return p; -} - -KOKKOS_FUNCTION -KOKKOS_MEMPOOLLIST_INLINE -void * -MemPoolList::allocate( size_t alloc_size ) const -{ - void * p = 0; - - // Find the first freelist whose chunk size is big enough for allocation. - size_t l_exp = 0; - while ( m_chunk_size[l_exp] > 0 && alloc_size > m_chunk_size[l_exp] ) ++l_exp; - -#ifdef KOKKOS_MEMPOOLLIST_PRINTERR - if ( m_chunk_size[l_exp] == 0 ) { - Kokkos::abort( "\n** MemoryPool::allocate() REQUESTED_SIZE_TOO_LARGE **\n" ); - } -#endif - - // Do a fast fail test for an empty list. This checks for l_exp and all - // higher freelists being empty. - size_t l = l_exp; - while ( m_chunk_size[l] > 0 && - get_head_offset( m_freelist[l] ) == FREELIST_END ) ++l; - - if ( m_chunk_size[l] != 0 ) { - // Try to grab a chunk from the l_exp list. 
- volatile uint64_t * l_exp_freelist = m_freelist + l_exp; - - bool done = false; - - while ( !done ) { - uint64_t old_head = *l_exp_freelist; - uint64_t old_head_off = get_head_offset( old_head ); - - if ( old_head_off == FREELIST_END ) { - // The list is empty. Try to refill it and grab a chunk. - p = refill_freelist(l_exp); - - done = true; - } - else if ( old_head_off != FREELIST_LOCK ) { - // The freelist wasn't empty or locked, so try to pop off the head. - uint64_t old_head_tag = get_head_tag( old_head ); - - // Increment the tag by 1, wrapping around to 0 after 2^32-1. - uint64_t new_head_tag = increment_tag( old_head_tag ); - uint64_t new_head_off = *reinterpret_cast<uint64_t *>( m_data + old_head_off ); - uint64_t new_head = create_head( new_head_off, new_head_tag ); - - // Attempt to pull off the head of the list and put the next entry in - // its place. If the list was changed - // (including being locked) between the initial look and now, head will - // be different than old_head. This means the insert can't proceed and - // has to be tried again. - uint64_t head = atomic_compare_exchange( l_exp_freelist, old_head, new_head ); - - if ( head == old_head ) { - done = true; - p = m_data + old_head_off; - } - } - } - } - -#ifdef KOKKOS_MEMPOOLLIST_PRINT_INFO -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - long val = p == 0 ? 
- *reinterpret_cast<volatile long *>( &m_count ) : - Kokkos::atomic_fetch_add( &m_count, 1 ); - - printf( " allocate(): %6ld size: %6lu l: %2lu %2lu 0x%llx\n", val, - alloc_size, l_exp, l, reinterpret_cast<uint64_t>( p ) ); - fflush( stdout ); -#else - printf( " allocate() size: %6lu l: %2lu %2lu 0x%lx\n", alloc_size, - l_exp, l, reinterpret_cast<uint64_t>( p ) ); -#endif -#endif - -#ifdef KOKKOS_MEMPOOLLIST_PRINTERR - if ( p == 0 ) { - printf( "** MemoryPool::allocate() NO_CHUNKS_BIG_ENOUGH **\n" ); -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - fflush( stdout ); -#endif - } -#endif - - return p; -} - -KOKKOS_FUNCTION -KOKKOS_MEMPOOLLIST_INLINE -void -MemPoolList::deallocate( void * alloc_ptr, size_t alloc_size ) const -{ -#ifdef KOKKOS_MEMPOOLLIST_PRINTERR - // Verify that the pointer is controlled by this pool. - { - char * ap = static_cast<char *>( alloc_ptr ); - - if ( ap < m_data || ap + alloc_size > m_data + m_data_size ) { - printf( "\n** MemoryPool::deallocate() ADDRESS_OUT_OF_RANGE(0x%llx) **\n", - reinterpret_cast<uint64_t>( alloc_ptr ) ); -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - fflush( stdout ); -#endif - Kokkos::abort( "" ); - } - } -#endif - - // Determine which freelist to place deallocated memory on. - size_t l = 0; - while ( m_chunk_size[l] > 0 && alloc_size > m_chunk_size[l] ) ++l; - -#ifdef KOKKOS_MEMPOOLLIST_PRINTERR - if ( m_chunk_size[l] == 0 ) { - printf( "\n** MemoryPool::deallocate() CHUNK_TOO_LARGE(%lu) **\n", alloc_size ); -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - fflush( stdout ); -#endif - Kokkos::abort( "" ); - } -#endif - - uint64_t offset = static_cast<char *>( alloc_ptr ) - m_data; - - // Insert a single chunk at the head of the freelist. - volatile uint64_t * freelist = m_freelist + l; - - bool inserted = false; - - while ( !inserted ) { - uint64_t old_head = *freelist; - - if ( old_head != FREELIST_LOCK_HEAD ) { - // In the initial look at the head, the freelist wasn't locked. 
- - uint64_t old_head_off = get_head_offset(old_head); - uint64_t old_head_tag = get_head_tag(old_head); - uint64_t new_head = create_head( offset, old_head_tag ); - - // Proactively point the new head to the old head assuming a successful - // insertion into the list. - *reinterpret_cast<volatile uint64_t *>( alloc_ptr ) = old_head_off; - - memory_fence(); - - // Attempt to insert at head of list. If the list was changed - // (including being locked) between the initial look and now, head will - // be different than old_head. This means the insert can't proceed and - // has to be tried again. - uint64_t head = atomic_compare_exchange( freelist, old_head, new_head ); - - if ( head == old_head ) inserted = true; - } - } - -#ifdef KOKKOS_MEMPOOLLIST_PRINT_INFO -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - long val = Kokkos::atomic_fetch_add( &m_count, -1 ) - 1; - printf( "deallocate(): %6ld size: %6lu l: %2lu 0x%llx\n", val, - alloc_size, l, reinterpret_cast<uint64_t>( alloc_ptr ) ); - fflush( stdout ); -#else - printf( "deallocate() size: %6lu l: %2lu 0x%lx\n", alloc_size, l, - reinterpret_cast<uint64_t>( alloc_ptr ) ); -#endif -#endif -} - - -} // namespace Impl -} // namespace Experimental -} // namespace Kokkos - -#undef KOKKOS_MEMPOOLLIST_INLINE - -#ifdef KOKKOS_MEMPOOLLIST_PRINTERR -#undef KOKKOS_MEMPOOLLIST_PRINTERR -#endif - -#ifdef KOKKOS_MEMPOOLLIST_PRINT_INFO -#undef KOKKOS_MEMPOOLLIST_PRINT_INFO -#endif - -#endif /* #ifndef KOKKOS_MEMORYPOOL_CPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp index 0e87c63e44..556c96d863 100644 --- a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp @@ -58,7 +58,7 @@ struct PhysicalLayout { long long int stride[8]; //distance between two neighboring elements in a given dimension template< class T , class L , class D , class M > - PhysicalLayout( const View<T,L,D,M,ViewDefault> & view ) + 
PhysicalLayout( const View<T,L,D,M> & view ) : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value ? Left : ( is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error )) , rank( view.Rank ) @@ -66,17 +66,6 @@ struct PhysicalLayout { for(int i=0;i<8;i++) stride[i] = 0; view.stride( stride ); } - #ifdef KOKKOS_HAVE_CUDA - template< class T , class L , class D , class M > - PhysicalLayout( const View<T,L,D,M,ViewCudaTexture> & view ) - : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value ? Left : ( - is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error )) - , rank( view.Rank ) - { - for(int i=0;i<8;i++) stride[i] = 0; - view.stride( stride ); - } - #endif }; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp index 5da60841d4..8ea1e816cd 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp @@ -45,7 +45,7 @@ #define KOKKOSP_DEVICE_INFO_HPP namespace Kokkos { -namespace Experimental { +namespace Profiling { struct KokkosPDeviceInfo { uint32_t deviceID; diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp index f499cc63a7..91faed170a 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp @@ -47,7 +47,7 @@ #include <string.h> namespace Kokkos { - namespace Experimental { + namespace Profiling { bool profileLibraryLoaded() { return (NULL != initProfileLibrary); } @@ -95,6 +95,12 @@ namespace Kokkos { } void initialize() { + + // Make sure initialize calls happens only once + static int is_initialized = 0; + if(is_initialized) return; + is_initialized = 1; + void* firstProfileLibrary; char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY"); @@ -153,6 
+159,11 @@ namespace Kokkos { } void finalize() { + // Make sure finalize calls happens only once + static int is_finalized = 0; + if(is_finalized) return; + is_finalized = 1; + if(NULL != finalizeProfileLibrary) { (*finalizeProfileLibrary)(); diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp index 919c4f619e..4f01256335 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -60,7 +60,7 @@ #if (KOKKOS_ENABLE_PROFILING) namespace Kokkos { - namespace Experimental { + namespace Profiling { typedef void (*initFunction)(const int, const uint64_t, diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp new file mode 100644 index 0000000000..e8bdbde6c6 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp @@ -0,0 +1,147 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY ) + +#include <impl/Kokkos_TaskQueue_impl.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template class TaskQueue< Kokkos::Serial > ; + +void TaskQueueSpecialization< Kokkos::Serial >::execute + ( TaskQueue< Kokkos::Serial > * const queue ) +{ + using execution_space = Kokkos::Serial ; + using queue_type = TaskQueue< execution_space > ; + using task_root_type = TaskBase< execution_space , void , void > ; + using Member = TaskExec< execution_space > ; + + task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + + Member exec ; + + // Loop until all queues are empty + while ( 0 < queue->m_ready_count ) { + + task_root_type * task = end ; + + for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) { + for ( int j = 0 ; j < 2 && end == task ; ++j ) { 
+ task = queue_type::pop_task( & queue->m_ready[i][j] ); + } + } + + if ( end != task ) { + + // pop_task resulted in lock == task->m_next + // In the executing state + + (*task->m_apply)( task , & exec ); + +#if 0 + printf( "TaskQueue<Serial>::executed: 0x%lx { 0x%lx 0x%lx %d %d %d }\n" + , uintptr_t(task) + , uintptr_t(task->m_wait) + , uintptr_t(task->m_next) + , task->m_task_type + , task->m_priority + , task->m_ref_count ); +#endif + + // If a respawn then re-enqueue otherwise the task is complete + // and all tasks waiting on this task are updated. + queue->complete( task ); + } + else if ( 0 != queue->m_ready_count ) { + Kokkos::abort("TaskQueue<Serial>::execute ERROR: ready_count"); + } + } +} + +void TaskQueueSpecialization< Kokkos::Serial > :: + iff_single_thread_recursive_execute( + TaskQueue< Kokkos::Serial > * const queue ) +{ + using execution_space = Kokkos::Serial ; + using queue_type = TaskQueue< execution_space > ; + using task_root_type = TaskBase< execution_space , void , void > ; + using Member = TaskExec< execution_space > ; + + task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + + Member exec ; + + // Loop until no runnable task + + task_root_type * task = end ; + + do { + + task = end ; + + for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) { + for ( int j = 0 ; j < 2 && end == task ; ++j ) { + task = queue_type::pop_task( & queue->m_ready[i][j] ); + } + } + + if ( end == task ) break ; + + (*task->m_apply)( task , & exec ); + + queue->complete( task ); + + } while(1); +} + +}} /* namespace Kokkos::Impl */ + +#endif /* #if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp new file mode 100644 index 0000000000..48a110c5f1 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp @@ -0,0 +1,271 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_SERIAL_TASK_HPP +#define KOKKOS_IMPL_SERIAL_TASK_HPP + +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +template<> +class TaskQueueSpecialization< Kokkos::Serial > +{ +public: + + using execution_space = Kokkos::Serial ; + using memory_space = Kokkos::HostSpace ; + using queue_type = Kokkos::Impl::TaskQueue< execution_space > ; + using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ; + + static + void iff_single_thread_recursive_execute( queue_type * const ); + + static + void execute( queue_type * const ); + + template< typename FunctorType > + static + void proc_set_apply( task_base_type::function_type * ptr ) + { + using TaskType = TaskBase< Kokkos::Serial + , typename FunctorType::value_type + , FunctorType + > ; + *ptr = TaskType::apply ; + } +}; + +extern template class TaskQueue< Kokkos::Serial > ; + +//---------------------------------------------------------------------------- + +template<> +class TaskExec< Kokkos::Serial > +{ +public: + + KOKKOS_INLINE_FUNCTION void team_barrier() const {} + KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; } + KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; } +}; + +template<typename iType> +struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > > +{ + typedef iType index_type; + const iType start ; + const iType end ; + enum {increment = 1}; + //const TaskExec< Kokkos::Serial > & thread; + TaskExec< Kokkos::Serial > & thread; + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct + //( const TaskExec< Kokkos::Serial > & 
arg_thread, const iType& arg_count) + ( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count) + : start(0) + , end(arg_count) + , thread(arg_thread) + {} + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct + //( const TaskExec< Kokkos::Serial > & arg_thread + ( TaskExec< Kokkos::Serial > & arg_thread + , const iType& arg_start + , const iType & arg_end + ) + : start( arg_start ) + , end( arg_end) + , thread( arg_thread ) + {} +}; + +}} /* namespace Kokkos::Impl */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +/* +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > > +TeamThreadRange( const Impl::TaskExec< Kokkos::Serial > & thread + , const iType & count ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count); +} +*/ +//TODO const issue omp +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > > +TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread + , const iType & count ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count); +} +/* +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > > +TeamThreadRange( const Impl:: TaskExec< Kokkos::Serial > & thread, const iType & start , const iType & end ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >(thread,start,end); +} +*/ +//TODO const issue omp +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > > +TeamThreadRange( Impl:: TaskExec< Kokkos::Serial > & thread, const iType & start , const iType & end ) +{ + return 
Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >(thread,start,end); +} + + /** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team. + * This functionality requires C++11 support.*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >& loop_boundaries, const Lambda& lambda) { + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i); +} + +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, + const Lambda & lambda, + ValueType& initialized_result) +{ + + ValueType result = initialized_result; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i, result); + + initialized_result = result; +} + +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, + const Lambda & lambda, + const JoinType & join, + ValueType& initialized_result) +{ + ValueType result = initialized_result; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i, result); + + initialized_result = result; +} +// placeholder for future function +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, + const Lambda & lambda, + ValueType& initialized_result) +{ +} +// placeholder for future function +template< 
typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce + (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, + const Lambda & lambda, + const JoinType & join, + ValueType& initialized_result) +{ +} + +template< typename ValueType, typename iType, class Lambda > +KOKKOS_INLINE_FUNCTION +void parallel_scan + (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, + const Lambda & lambda) +{ + ValueType accum = 0 ; + ValueType val, local_total; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + local_total = 0; + lambda(i,local_total,false); + val = accum; + lambda(i,val,true); + accum += local_total; + } + +} + +// placeholder for future function +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_scan + (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, + const Lambda & lambda) +{ +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ +#endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp index 5f3e65b327..1577df07cd 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp @@ -45,7 +45,8 @@ #include <impl/Kokkos_Serial_TaskPolicy.hpp> -#if defined( KOKKOS_HAVE_SERIAL ) +#if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY ) + #include <stdlib.h> #include <stdexcept> #include <iostream> @@ -252,6 +253,12 @@ void Task::schedule() if ( ok_state && ok_list ) { + if ( 
TASK_STATE_CONSTRUCTING == m_state ) { + // Initial scheduling increment, + // matched by decrement when task is complete. + ++m_ref_count ; + } + // Will be waiting for execution upon return from this function m_state = Kokkos::Experimental::TASK_STATE_WAITING ; @@ -286,7 +293,8 @@ void Task::execute_ready_tasks() // Task * task ; // while ( ! CAS( & s_ready , task = s_ready , s_ready->m_next ) ); - Task * const task = s_ready ; + Task * task = s_ready ; + s_ready = task->m_next ; task->m_next = 0 ; @@ -325,6 +333,9 @@ void Task::execute_ready_tasks() x = next ; } + + // Decrement to match the initial scheduling increment + assign( & task , 0 ); } } } @@ -333,4 +344,5 @@ void Task::execute_ready_tasks() } // namespace Experimental } // namespace Kokkos -#endif // defined( KOKKOS_HAVE_SERIAL ) +#endif /* #if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp index 3171449c16..a333f948ae 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp @@ -43,10 +43,11 @@ // Experimental unified task-data parallel manycore LDRD -#ifndef KOKKOS_SERIAL_TASKPOLICY_HPP -#define KOKKOS_SERIAL_TASKPOLICY_HPP +#ifndef KOKKOS_EXPERIMENTAL_SERIAL_TASKPOLICY_HPP +#define KOKKOS_EXPERIMENTAL_SERIAL_TASKPOLICY_HPP #include <Kokkos_Macros.hpp> + #if defined( KOKKOS_HAVE_SERIAL ) #include <string> @@ -57,6 +58,8 @@ #include <Kokkos_TaskPolicy.hpp> #include <Kokkos_View.hpp> +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + #include <impl/Kokkos_FunctorAdapter.hpp> //---------------------------------------------------------------------------- @@ -668,6 +671,7 @@ void wait( TaskPolicy< Kokkos::Serial > & ) //---------------------------------------------------------------------------- +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ #endif /* defined( KOKKOS_HAVE_SERIAL ) */ 
-#endif /* #define KOKKOS_SERIAL_TASK_HPP */ +#endif /* #define KOKKOS_EXPERIMENTAL_SERIAL_TASK_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp index 399b633be9..0bc2864ff1 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -46,93 +46,84 @@ #include <impl/Kokkos_Traits.hpp> #include <Kokkos_Core_fwd.hpp> +#include <type_traits> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class C , class Enable = void > -struct is_memory_space_enable -{ typedef std::false_type type ; }; - -template< class C > -struct is_memory_space_enable< C , - typename std::enable_if< - std::is_same< C , typename C::memory_space >::value - >::type > -{ typedef std::true_type type ; }; - - -template< class C , class Enable = void > -struct is_execution_space_enable -{ typedef std::false_type type ; }; - -template< class C > -struct is_execution_space_enable< C , - 
typename std::enable_if< - std::is_same< C , typename C::execution_space >::value - >::type > -{ typedef std::true_type type ; }; - - -template< class C , class Enable = void > -struct is_execution_policy_enable -{ typedef std::false_type type ; }; +/** KOKKOS_HAVE_TYPE( Type ) + * + * defines a meta-function that check if a type expose an internal typedef or + * type alias which matches Type + * + * e.g. + * KOKKOS_HAVE_TYPE( array_layout ); + * struct Foo { using array_layout = void; }; + * have_array_layout<Foo>::value == 1; + */ +#define KOKKOS_HAVE_TYPE( Type ) \ +template <typename T> \ +struct have_##Type { \ + template <typename U> static std::false_type have_type(...); \ + template <typename U> static std::true_type have_type( typename U::Type* ); \ + using type = decltype(have_type<T>(nullptr)); \ + static constexpr bool value = type::value; \ +} -template< class C > -struct is_execution_policy_enable< C , - typename std::enable_if< - std::is_same< C , typename C::execution_policy >::value - >::type > -{ typedef std::true_type type ; }; +/** KOKKOS_IS_CONCEPT( Concept ) + * + * defines a meta-function that check if a type match the given Kokkos concept + * type alias which matches Type + * + * e.g. 
+ * KOKKOS_IS_CONCEPT( array_layout ); + * struct Foo { using array_layout = Foo; }; + * is_array_layout<Foo>::value == 1; + */ +#define KOKKOS_IS_CONCEPT( Concept ) \ +template <typename T> \ +struct is_##Concept { \ + template <typename U> static std::false_type have_concept(...); \ + template <typename U> static auto have_concept( typename U::Concept* ) \ + ->typename std::is_same<T, typename U::Concept>::type;\ + using type = decltype(have_concept<T>(nullptr)); \ + static constexpr bool value = type::value; \ +} +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- -template< class C , class Enable = void > -struct is_array_layout_enable -{ typedef std::false_type type ; }; +namespace Kokkos { namespace Impl { -template< class C > -struct is_array_layout_enable< C , - typename std::enable_if< - std::is_same< C , typename C::array_layout >::value - >::type > -{ typedef std::true_type type ; }; +template <typename T> +using is_void = std::is_same<void,T>; +// is_memory_space<T>::value +KOKKOS_IS_CONCEPT( memory_space ); -template< class C , class Enable = void > -struct is_memory_traits_enable -{ typedef std::false_type type ; }; +// is_memory_traits<T>::value +KOKKOS_IS_CONCEPT( memory_traits ); -template< class C > -struct is_memory_traits_enable< C , - typename std::enable_if< - std::is_same< C , typename C::memory_traits >::value - >::type > -{ typedef std::true_type type ; }; +// is_execution_space<T>::value +KOKKOS_IS_CONCEPT( execution_space ); +// is_execution_policy<T>::value +KOKKOS_IS_CONCEPT( execution_policy ); -template< class C > -using is_memory_space = typename is_memory_space_enable<C>::type ; +// is_array_layout<T>::value +KOKKOS_IS_CONCEPT( array_layout ); -template< class C > -using is_execution_space = typename is_execution_space_enable<C>::type ; +// is_iteration_pattern<T>::value +KOKKOS_IS_CONCEPT( iteration_pattern ); -template< 
class C > -using is_execution_policy = typename is_execution_policy_enable<C>::type ; +// is_schedule_type<T>::value +KOKKOS_IS_CONCEPT( schedule_type ); -template< class C > -using is_array_layout = typename is_array_layout_enable<C>::type ; +// is_index_type<T>::value +KOKKOS_IS_CONCEPT( index_type ); -template< class C > -using is_memory_traits = typename is_memory_traits_enable<C>::type ; +}} // namespace Kokkos::Impl -} -} //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp new file mode 100644 index 0000000000..663bb1985d --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp @@ -0,0 +1,499 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_IMPL_TASKQUEUE_HPP +#define KOKKOS_IMPL_TASKQUEUE_HPP + +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + +#include <string> +#include <typeinfo> +#include <stdexcept> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< typename > class TaskPolicy ; + +template< typename Arg1 = void , typename Arg2 = void > class Future ; + +} /* namespace Kokkos */ + +namespace Kokkos { +namespace Impl { + +template< typename , typename , typename > class TaskBase ; +template< typename > class TaskExec ; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< typename Space > +class TaskQueueSpecialization ; + +/** \brief Manage 
task allocation, deallocation, and scheduling. + * + * Task execution is deferred to the TaskQueueSpecialization. + * All other aspects of task management have shared implementation. + */ +template< typename ExecSpace > +class TaskQueue { +private: + + friend class TaskQueueSpecialization< ExecSpace > ; + friend class Kokkos::TaskPolicy< ExecSpace > ; + + using execution_space = ExecSpace ; + using specialization = TaskQueueSpecialization< execution_space > ; + using memory_space = typename specialization::memory_space ; + using device_type = Kokkos::Device< execution_space , memory_space > ; + using memory_pool = Kokkos::Experimental::MemoryPool< device_type > ; + using task_root_type = Kokkos::Impl::TaskBase<execution_space,void,void> ; + + struct Destroy { + TaskQueue * m_queue ; + void destroy_shared_allocation(); + }; + + //---------------------------------------- + + enum : int { NumQueue = 3 }; + + // Queue is organized as [ priority ][ type ] + + memory_pool m_memory ; + task_root_type * volatile m_ready[ NumQueue ][ 2 ]; + long m_accum_alloc ; // Accumulated number of allocations + int m_count_alloc ; // Current number of allocations + int m_max_alloc ; // Maximum number of allocations + int m_ready_count ; // Number of ready or executing + + //---------------------------------------- + + ~TaskQueue(); + TaskQueue() = delete ; + TaskQueue( TaskQueue && ) = delete ; + TaskQueue( TaskQueue const & ) = delete ; + TaskQueue & operator = ( TaskQueue && ) = delete ; + TaskQueue & operator = ( TaskQueue const & ) = delete ; + + TaskQueue + ( const memory_space & arg_space + , unsigned const arg_memory_pool_capacity + , unsigned const arg_memory_pool_superblock_capacity_log2 + ); + + // Schedule a task + // Precondition: + // task is not executing + // task->m_next is the dependence or zero + // Postcondition: + // task->m_next is linked list membership + KOKKOS_FUNCTION + void schedule( task_root_type * const ); + + // Complete a task + // Precondition: + // task 
is not executing + // task->m_next == LockTag => task is complete + // task->m_next != LockTag => task is respawn + // Postcondition: + // task->m_wait == LockTag => task is complete + // task->m_wait != LockTag => task is waiting + KOKKOS_FUNCTION + void complete( task_root_type * ); + + KOKKOS_FUNCTION + static bool push_task( task_root_type * volatile * const + , task_root_type * const ); + + KOKKOS_FUNCTION + static task_root_type * pop_task( task_root_type * volatile * const ); + + KOKKOS_FUNCTION static + void decrement( task_root_type * task ); + +public: + + // If and only if the execution space is a single thread + // then execute ready tasks. + KOKKOS_INLINE_FUNCTION + void iff_single_thread_recursive_execute() + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + specialization::iff_single_thread_recursive_execute( this ); +#endif + } + + void execute() { specialization::execute( this ); } + + // Assign task pointer with reference counting of assigned tasks + template< typename LV , typename RV > + KOKKOS_FUNCTION static + void assign( TaskBase< execution_space,LV,void> ** const lhs + , TaskBase< execution_space,RV,void> * const rhs ) + { + using task_lhs = TaskBase< execution_space,LV,void> ; +#if 0 + { + printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n" + , uintptr_t( lhs ? *lhs : 0 ) + , uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 ) + , int( lhs && *lhs ? (*lhs)->m_task_type : 0 ) + , int( lhs && *lhs ? (*lhs)->m_ref_count : 0 ) + , uintptr_t(rhs) + , uintptr_t( rhs ? rhs->m_next : 0 ) + , int( rhs ? rhs->m_task_type : 0 ) + , int( rhs ? 
rhs->m_ref_count : 0 ) + ); + fflush( stdout ); + } +#endif + + if ( *lhs ) decrement( *lhs ); + if ( rhs ) { Kokkos::atomic_fetch_add( &(rhs->m_ref_count) , 1 ); } + + // Force write of *lhs + + *static_cast< task_lhs * volatile * >(lhs) = rhs ; + + Kokkos::memory_fence(); + } + + KOKKOS_FUNCTION + size_t allocate_block_size( size_t n ); ///< Actual block size allocated + + KOKKOS_FUNCTION + void * allocate( size_t n ); ///< Allocate from the memory pool + + KOKKOS_FUNCTION + void deallocate( void * p , size_t n ); ///< Deallocate to the memory pool +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template<> +class TaskBase< void , void , void > { +public: + enum : int16_t { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 }; + enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) }; +}; + +/** \brief Base class for task management, access, and execution. + * + * Inheritance structure to allow static_cast from the task root type + * and a task's FunctorType. + * + * // Enable a Future to access result data + * TaskBase< Space , ResultType , void > + * : TaskBase< void , void , void > + * { ... }; + * + * // Enable a functor to access the base class + * TaskBase< Space , ResultType , FunctorType > + * : TaskBase< Space , ResultType , void > + * , FunctorType + * { ... 
}; + * + * + * States of a task: + * + * Constructing State, NOT IN a linked list + * m_wait == 0 + * m_next == 0 + * + * Scheduling transition : Constructing -> Waiting + * before: + * m_wait == 0 + * m_next == this task's initial dependence, 0 if none + * after: + * m_wait == EndTag + * m_next == EndTag + * + * Waiting State, IN a linked list + * m_apply != 0 + * m_queue != 0 + * m_ref_count > 0 + * m_wait == head of linked list of tasks waiting on this task + * m_next == next of linked list of tasks + * + * transition : Waiting -> Executing + * before: + * m_next == EndTag + * after:: + * m_next == LockTag + * + * Executing State, NOT IN a linked list + * m_apply != 0 + * m_queue != 0 + * m_ref_count > 0 + * m_wait == head of linked list of tasks waiting on this task + * m_next == LockTag + * + * Respawn transition : Executing -> Executing-Respawn + * before: + * m_next == LockTag + * after: + * m_next == this task's updated dependence, 0 if none + * + * Executing-Respawn State, NOT IN a linked list + * m_apply != 0 + * m_queue != 0 + * m_ref_count > 0 + * m_wait == head of linked list of tasks waiting on this task + * m_next == this task's updated dependence, 0 if none + * + * transition : Executing -> Complete + * before: + * m_wait == head of linked list + * after: + * m_wait == LockTag + * + * Complete State, NOT IN a linked list + * m_wait == LockTag: cannot add dependence + * m_next == LockTag: not a member of a wait queue + * + */ +template< typename ExecSpace > +class TaskBase< ExecSpace , void , void > +{ +public: + + enum : int16_t { TaskTeam = TaskBase<void,void,void>::TaskTeam + , TaskSingle = TaskBase<void,void,void>::TaskSingle + , Aggregate = TaskBase<void,void,void>::Aggregate }; + + enum : uintptr_t { LockTag = TaskBase<void,void,void>::LockTag + , EndTag = TaskBase<void,void,void>::EndTag }; + + using execution_space = ExecSpace ; + using queue_type = TaskQueue< execution_space > ; + + template< typename > friend class Kokkos::TaskPolicy ; + + 
typedef void (* function_type) ( TaskBase * , void * ); + + // sizeof(TaskBase) == 48 + + function_type m_apply ; ///< Apply function pointer + queue_type * m_queue ; ///< Queue in which this task resides + TaskBase * m_wait ; ///< Linked list of tasks waiting on this + TaskBase * m_next ; ///< Waiting linked-list next + int32_t m_ref_count ; ///< Reference count + int32_t m_alloc_size ;///< Allocation size + int32_t m_dep_count ; ///< Aggregate's number of dependences + int16_t m_task_type ; ///< Type of task + int16_t m_priority ; ///< Priority of runnable task + + TaskBase( TaskBase && ) = delete ; + TaskBase( const TaskBase & ) = delete ; + TaskBase & operator = ( TaskBase && ) = delete ; + TaskBase & operator = ( const TaskBase & ) = delete ; + + KOKKOS_INLINE_FUNCTION ~TaskBase() = default ; + + KOKKOS_INLINE_FUNCTION + constexpr TaskBase() noexcept + : m_apply(0) + , m_queue(0) + , m_wait(0) + , m_next(0) + , m_ref_count(0) + , m_alloc_size(0) + , m_dep_count(0) + , m_task_type( TaskSingle ) + , m_priority( 1 /* TaskRegularPriority */ ) + {} + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + TaskBase ** aggregate_dependences() + { return reinterpret_cast<TaskBase**>( this + 1 ); } + + using get_return_type = void ; + + KOKKOS_INLINE_FUNCTION + get_return_type get() const {} +}; + +template < typename ExecSpace , typename ResultType > +class TaskBase< ExecSpace , ResultType , void > + : public TaskBase< ExecSpace , void , void > +{ +private: + + static_assert( sizeof(TaskBase<ExecSpace,void,void>) == 48 , "" ); + + TaskBase( TaskBase && ) = delete ; + TaskBase( const TaskBase & ) = delete ; + TaskBase & operator = ( TaskBase && ) = delete ; + TaskBase & operator = ( const TaskBase & ) = delete ; + +public: + + ResultType m_result ; + + KOKKOS_INLINE_FUNCTION ~TaskBase() = default ; + + KOKKOS_INLINE_FUNCTION + TaskBase() + : TaskBase< ExecSpace , void , void >() + , m_result() + {} + + using get_return_type = ResultType const & ; + + 
KOKKOS_INLINE_FUNCTION + get_return_type get() const { return m_result ; } +}; + + +template< typename ExecSpace , typename ResultType , typename FunctorType > +class TaskBase + : public TaskBase< ExecSpace , ResultType , void > + , public FunctorType +{ +private: + + TaskBase() = delete ; + TaskBase( TaskBase && ) = delete ; + TaskBase( const TaskBase & ) = delete ; + TaskBase & operator = ( TaskBase && ) = delete ; + TaskBase & operator = ( const TaskBase & ) = delete ; + +public: + + using root_type = TaskBase< ExecSpace , void , void > ; + using base_type = TaskBase< ExecSpace , ResultType , void > ; + using member_type = TaskExec< ExecSpace > ; + using functor_type = FunctorType ; + using result_type = ResultType ; + + template< typename Type > + KOKKOS_INLINE_FUNCTION static + void apply_functor + ( Type * const task + , typename std::enable_if + < std::is_same< typename Type::result_type , void >::value + , member_type * const + >::type member + ) + { + using fType = typename Type::functor_type ; + static_cast<fType*>(task)->operator()( *member ); + } + + template< typename Type > + KOKKOS_INLINE_FUNCTION static + void apply_functor + ( Type * const task + , typename std::enable_if + < ! std::is_same< typename Type::result_type , void >::value + , member_type * const + >::type member + ) + { + using fType = typename Type::functor_type ; + static_cast<fType*>(task)->operator()( *member , task->m_result ); + } + + KOKKOS_FUNCTION static + void apply( root_type * root , void * exec ) + { + TaskBase * const lock = reinterpret_cast< TaskBase * >( root_type::LockTag ); + TaskBase * const task = static_cast< TaskBase * >( root ); + member_type * const member = reinterpret_cast< member_type * >( exec ); + + TaskBase::template apply_functor( task , member ); + + // Task may be serial or team. + // If team then must synchronize before querying task->m_next. + // If team then only one thread calls destructor. 
+ + member->team_barrier(); + + if ( 0 == member->team_rank() && lock == task->m_next ) { + // Did not respawn, destroy the functor to free memory + static_cast<functor_type*>(task)->~functor_type(); + // Cannot destroy the task until its dependences + // have been processed. + } + } + + KOKKOS_INLINE_FUNCTION + TaskBase( FunctorType const & arg_functor ) + : base_type() + , FunctorType( arg_functor ) + {} + + KOKKOS_INLINE_FUNCTION + ~TaskBase() {} +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ +#endif /* #ifndef KOKKOS_IMPL_TASKQUEUE_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp new file mode 100644 index 0000000000..70a880d4a2 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp @@ -0,0 +1,569 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +template< typename ExecSpace > +void TaskQueue< ExecSpace >::Destroy::destroy_shared_allocation() +{ + m_queue->~TaskQueue(); +} + +//---------------------------------------------------------------------------- + +template< typename ExecSpace > +TaskQueue< ExecSpace >::TaskQueue + ( const TaskQueue< ExecSpace >::memory_space & arg_space + , unsigned const arg_memory_pool_capacity + , unsigned const arg_memory_pool_superblock_capacity_log2 + ) + : m_memory( arg_space + , arg_memory_pool_capacity + , arg_memory_pool_superblock_capacity_log2 ) + , m_ready() + , m_accum_alloc(0) + , m_max_alloc(0) + , m_ready_count(0) +{ + for ( int i = 0 ; i < NumQueue ; ++i ) { + m_ready[i][0] = (task_root_type *) task_root_type::EndTag ; + m_ready[i][1] = (task_root_type *) task_root_type::EndTag ; + } +} + +//---------------------------------------------------------------------------- + +template< typename ExecSpace > +TaskQueue< ExecSpace >::~TaskQueue() +{ + // Verify that queues are empty and ready count is zero + + for ( int i = 0 ; i < NumQueue ; ++i ) { + for ( int j = 0 ; j < 2 ; ++j ) { + if ( m_ready[i][j] != (task_root_type *) task_root_type::EndTag ) { + Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready tasks"); + } + } + } + + if ( 0 != m_ready_count ) { + Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks"); + } +} + +//---------------------------------------------------------------------------- + +template< typename ExecSpace > +KOKKOS_FUNCTION +void TaskQueue< ExecSpace >::decrement + ( TaskQueue< ExecSpace >::task_root_type * task ) +{ + const int count = Kokkos::atomic_fetch_add(&(task->m_ref_count),-1); + +#if 0 + if ( 1 == count ) { + printf( 
"decrement-destroy( 0x%lx { 0x%lx %d %d } )\n" + , uintptr_t( task ) + , uintptr_t( task->m_next ) + , int( task->m_task_type ) + , int( task->m_ref_count ) + ); + } +#endif + + if ( ( 1 == count ) && + ( task->m_next == (task_root_type *) task_root_type::LockTag ) ) { + // Reference count is zero and task is complete, deallocate. + task->m_queue->deallocate( task , task->m_alloc_size ); + } + else if ( count <= 1 ) { + Kokkos::abort("TaskPolicy task has negative reference count or is incomplete" ); + } +} + +//---------------------------------------------------------------------------- + +template< typename ExecSpace > +KOKKOS_FUNCTION +size_t TaskQueue< ExecSpace >::allocate_block_size( size_t n ) +{ + return m_memory.allocate_block_size( n ); +} + +template< typename ExecSpace > +KOKKOS_FUNCTION +void * TaskQueue< ExecSpace >::allocate( size_t n ) +{ + void * const p = m_memory.allocate(n); + + if ( p ) { + Kokkos::atomic_increment( & m_accum_alloc ); + Kokkos::atomic_increment( & m_count_alloc ); + + if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ; + } + + return p ; +} + +template< typename ExecSpace > +KOKKOS_FUNCTION +void TaskQueue< ExecSpace >::deallocate( void * p , size_t n ) +{ + m_memory.deallocate( p , n ); + Kokkos::atomic_decrement( & m_count_alloc ); +} + +//---------------------------------------------------------------------------- + +template< typename ExecSpace > +KOKKOS_FUNCTION +bool TaskQueue< ExecSpace >::push_task + ( TaskQueue< ExecSpace >::task_root_type * volatile * const queue + , TaskQueue< ExecSpace >::task_root_type * const task + ) +{ + // Push task into a concurrently pushed and popped queue. + // The queue is a linked list where 'task->m_next' form the links. + // Fail the push attempt if the queue is locked; + // otherwise retry until the push succeeds. 
+ +#if 0 + printf( "push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n" + , uintptr_t(queue) + , uintptr_t(*queue) + , uintptr_t(task) + , uintptr_t(task->m_wait) + , uintptr_t(task->m_next) + , task->m_task_type + , task->m_priority + , task->m_ref_count ); +#endif + + task_root_type * const zero = (task_root_type *) 0 ; + task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; + + task_root_type * volatile * const next = & task->m_next ; + + if ( zero != *next ) { + Kokkos::abort("TaskQueue::push_task ERROR: already a member of another queue" ); + } + + task_root_type * y = *queue ; + + while ( lock != y ) { + + *next = y ; + + // Do not proceed until '*next' has been stored. + Kokkos::memory_fence(); + + task_root_type * const x = y ; + + y = Kokkos::atomic_compare_exchange(queue,y,task); + + if ( x == y ) return true ; + } + + // Failed, replace 'task->m_next' value since 'task' remains + // not a member of a queue. + + *next = zero ; + + // Do not proceed until '*next' has been stored. + Kokkos::memory_fence(); + + return false ; +} + +//---------------------------------------------------------------------------- + +template< typename ExecSpace > +KOKKOS_FUNCTION +typename TaskQueue< ExecSpace >::task_root_type * +TaskQueue< ExecSpace >::pop_task + ( TaskQueue< ExecSpace >::task_root_type * volatile * const queue ) +{ + // Pop task from a concurrently pushed and popped queue. + // The queue is a linked list where 'task->m_next' form the links. + + task_root_type * const zero = (task_root_type *) 0 ; + task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; + task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + + // *queue is + // end => an empty queue + // lock => a locked queue + // valid + + // Retry until the lock is acquired or the queue is empty. 
+ + task_root_type * task = *queue ; + + while ( end != task ) { + + // The only possible values for the queue are + // (1) lock, (2) end, or (3) a valid task. + // Thus zero will never appear in the queue. + // + // If queue is locked then just read by guaranteeing + // the CAS will fail. + + if ( lock == task ) task = 0 ; + + task_root_type * const x = task ; + + task = Kokkos::atomic_compare_exchange(queue,task,lock); + + if ( x == task ) break ; // CAS succeeded and queue is locked + } + + if ( end != task ) { + + // This thread has locked the queue and removed 'task' from the queue. + // Extract the next entry of the queue from 'task->m_next' + // and mark 'task' as popped from a queue by setting + // 'task->m_next = lock'. + + task_root_type * const next = + Kokkos::atomic_exchange( & task->m_next , lock ); + + // Place the next entry in the head of the queue, + // which also unlocks the queue. + + task_root_type * const unlock = + Kokkos::atomic_exchange( queue , next ); + + if ( next == zero || next == lock || lock != unlock ) { + Kokkos::abort("TaskQueue::pop_task ERROR"); + } + } + +#if 0 + if ( end != task ) { + printf( "pop_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n" + , uintptr_t(queue) + , uintptr_t(task) + , uintptr_t(task->m_wait) + , uintptr_t(task->m_next) + , int(task->m_task_type) + , int(task->m_priority) + , int(task->m_ref_count) ); + } +#endif + + return task ; +} + +//---------------------------------------------------------------------------- + +template< typename ExecSpace > +KOKKOS_FUNCTION +void TaskQueue< ExecSpace >::schedule + ( TaskQueue< ExecSpace >::task_root_type * const task ) +{ + // Schedule a runnable or when_all task upon construction / spawn + // and upon completion of other tasks that 'task' is waiting on. 
+
+  // Precondition on runnable task state:
+  //   task is either constructing or executing
+  //
+  //   Constructing state:
+  //     task->m_wait == 0
+  //     task->m_next == dependence
+  //   Executing-respawn state:
+  //     task->m_wait == head of linked list
+  //     task->m_next == dependence
+  //
+  // Task state transition:
+  //   Constructing -> Waiting
+  //   Executing-respawn -> Waiting
+  //
+  // Postcondition on task state:
+  //   task->m_wait == head of linked list
+  //   task->m_next == member of linked list
+
+#if 0
+  printf( "schedule( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+  //----------------------------------------
+  {
+    // If Constructing then task->m_wait == 0
+    // Change to waiting by task->m_wait = EndTag
+
+    task_root_type * const init =
+      Kokkos::atomic_compare_exchange( & task->m_wait , zero , end );
+
+    // Precondition
+
+    if ( lock == init ) {
+      Kokkos::abort("TaskQueue::schedule ERROR: task is complete");
+    }
+
+    // if ( init == 0 ) Constructing -> Waiting
+    // else             Executing-Respawn -> Waiting
+  }
+  //----------------------------------------
+
+  if ( task_root_type::Aggregate != task->m_task_type ) {
+
+    // Scheduling a runnable task which may have a dependency 'dep'.
+    // Extract dependence, if any, from task->m_next.
+    // If 'dep' is not null then attempt to push 'task'
+    // into the wait queue of 'dep'.
+    // If the push succeeds then 'task' may be
+    // processed or executed by another thread at any time.
+    // If the push fails then 'dep' is complete and 'task'
+    // is ready to execute.
+
+    task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
+
+    const bool is_ready =
+      ( 0 == dep ) || ( !
push_task( & dep->m_wait , task ) ); + + // Reference count for dep was incremented when assigned + // to task->m_next so that if it completed prior to the + // above push_task dep would not be destroyed. + // dep reference count can now be decremented, + // which may deallocate the task. + TaskQueue::assign( & dep , (task_root_type *)0 ); + + if ( is_ready ) { + + // No dependence or 'dep' is complete so push task into ready queue. + // Increment the ready count before pushing into ready queue + // to track number of ready + executing tasks. + // The ready count will be decremented when the task is complete. + + Kokkos::atomic_increment( & m_ready_count ); + + task_root_type * volatile * const queue = + & m_ready[ task->m_priority ][ task->m_task_type ]; + + // A push_task fails if the ready queue is locked. + // A ready queue is only locked during a push or pop; + // i.e., it is never permanently locked. + // Retry push to ready queue until it succeeds. + // When the push succeeds then 'task' may be + // processed or executed by another thread at any time. + + while ( ! push_task( queue , task ) ); + } + } + //---------------------------------------- + else { + // Scheduling a 'when_all' task with multiple dependences. + // This scheduling may be called when the 'when_all' is + // (1) created or + // (2) being removed from a completed task's wait list. + + task_root_type ** const aggr = task->aggregate_dependences(); + + // Assume the 'when_all' is complete until a dependence is + // found that is not complete. + + bool is_complete = true ; + + for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) { + + --i ; + + // Loop dependences looking for an incomplete task. + // Add this task to the incomplete task's wait queue. + + // Remove a task 'x' from the dependence list. + // The reference count of 'x' was incremented when + // it was assigned into the dependence list. 
+
+      task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
+
+      if ( x ) {
+
+        // If x->m_wait is not locked then push succeeds
+        // and the aggregate is not complete.
+        // If the push succeeds then this when_all 'task' may be
+        // processed by another thread at any time.
+        // For example, 'x' may be completed by another
+        // thread and then re-schedule this when_all 'task'.
+
+        is_complete = ! push_task( & x->m_wait , task );
+
+        // Decrement reference count which had been incremented
+        // when 'x' was added to the dependence list.
+
+        TaskQueue::assign( & x , zero );
+      }
+    }
+
+    if ( is_complete ) {
+      // The when_all 'task' was not added to a wait queue because
+      // all dependences were complete so this aggregate is complete.
+      // Complete the when_all 'task' to schedule other tasks
+      // that are waiting for the when_all 'task' to complete.
+
+      task->m_next = lock ;
+
+      complete( task );
+
+      // '*task' may have been deleted upon completion
+    }
+  }
+  //----------------------------------------
+  // Postcondition:
+  // A runnable 'task' was pushed into a wait or ready queue.
+  // An aggregate 'task' was either pushed to a wait queue
+  // or completed.
+  // Concurrent execution may have already popped 'task'
+  // from a queue and processed it as appropriate.
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::complete
+  ( TaskQueue< ExecSpace >::task_root_type * task )
+{
+  // Complete a runnable task that has finished executing
+  // or a when_all task when all of its dependences are complete.
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+#if 0
+  printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+  fflush( stdout );
+#endif
+
+  const bool runnable = task_root_type::Aggregate != task->m_task_type ;
+
+  //----------------------------------------
+
+  if ( runnable && lock != task->m_next ) {
+    // A runnable task has finished executing and requested respawn.
+    // Schedule the task for subsequent execution.
+
+    schedule( task );
+  }
+  //----------------------------------------
+  else {
+    // Is either an aggregate or a runnable task that executed
+    // and did not respawn. Transition this task to complete.
+
+    // If 'task' is an aggregate then any of the runnable tasks that
+    // it depends upon may be attempting to complete this 'task'.
+    // Must only transition a task once to complete status.
+    // This is controlled by atomically locking the wait queue.
+
+    // Stop other tasks from adding themselves to this task's wait queue
+    // by locking the head of this task's wait queue.
+
+    task_root_type * x = Kokkos::atomic_exchange( & task->m_wait , lock );
+
+    if ( x != (task_root_type *) lock ) {
+
+      // This thread has transitioned this 'task' to complete.
+      // 'task' is no longer in a queue and is not executing
+      // so decrement the reference count from 'task's creation.
+      // If no other references to this 'task' then it will be deleted.
+
+      TaskQueue::assign( & task , zero );
+
+      // This thread has exclusive access to the wait list so
+      // the concurrency-safe pop_task function is not needed.
+      // Schedule the tasks that have been waiting on the input 'task',
+      // which may have been deleted.
+ + while ( x != end ) { + + // Set x->m_next = zero <= no dependence + + task_root_type * const next = + (task_root_type *) Kokkos::atomic_exchange( & x->m_next , zero ); + + schedule( x ); + + x = next ; + } + } + } + + if ( runnable ) { + // A runnable task was popped from a ready queue and executed. + // If respawned into a ready queue then the ready count was incremented + // so decrement whether respawned or not. + Kokkos::atomic_decrement( & m_ready_count ); + } +} + +//---------------------------------------------------------------------------- + +} /* namespace Impl */ +} /* namespace Kokkos */ + + +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp index 80a326f080..1f14e42874 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp @@ -109,6 +109,9 @@ public: }; } // namespace Impl + + using Kokkos::Impl::Timer ; + } // namespace Kokkos #endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp index b9e07a82de..278f715bc9 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp @@ -47,6 +47,7 @@ #include <stddef.h> #include <stdint.h> #include <Kokkos_Macros.hpp> +#include <string> #include <type_traits> namespace Kokkos { @@ -357,9 +358,31 @@ struct is_integral : public integral_constant< bool , std::is_same< T , uint64_t >::value )> {}; - //---------------------------------------------------------------------------- +template<typename T> +struct is_label : public false_type {}; + +template<> +struct is_label<const char*> : public true_type {}; + +template<> +struct is_label<char*> : public true_type {}; + + +template<int N> +struct is_label<const char[N]> : public true_type {}; + +template<int N> +struct is_label<char[N]> : public true_type {}; + + +template<> +struct 
is_label<const std::string> : public true_type {}; + +template<> +struct is_label<std::string> : public true_type {}; + // These 'constexpr'functions can be used as // both regular functions and meta-function. diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp index 452af66cde..8b63039f57 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp @@ -115,135 +115,6 @@ template< class ExecSpace , class Type , bool Initialize > struct ViewDefaultConstruct { ViewDefaultConstruct( Type * , size_t ) {} }; -#if ! KOKKOS_USING_EXP_VIEW - -/** \brief ViewDataHandle provides the type of the 'data handle' which the view - * uses to access data with the [] operator. It also provides - * an allocate function and a function to extract a raw ptr from the - * data handle. ViewDataHandle also defines an enum ReferenceAble which - * specifies whether references/pointers to elements can be taken and a - * 'return_type' which is what the view operators will give back. - * Specialisation of this object allows three things depending - * on ViewTraits and compiler options: - * (i) Use special allocator (e.g. huge pages/small pages and pinned memory) - * (ii) Use special data handle type (e.g. add Cuda Texture Object) - * (iii) Use special access intrinsics (e.g. 
texture fetch and non-caching loads) - */ -template< class StaticViewTraits , class Enable = void > -struct ViewDataHandle { - - enum { ReturnTypeIsReference = true }; - - typedef typename StaticViewTraits::value_type * handle_type; - typedef typename StaticViewTraits::value_type & return_type; - - KOKKOS_INLINE_FUNCTION - static handle_type create_handle( typename StaticViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ ) - { - return handle_type(arg_data_ptr); - } -}; - -template< class StaticViewTraits , class Enable = void > -class ViewDataManagement : public ViewDataHandle< StaticViewTraits > { -private: - - template< class , class > friend class ViewDataManagement ; - - struct PotentiallyManaged {}; - struct StaticallyUnmanaged {}; - - /* Statically unmanaged if traits or not executing in host-accessible memory space */ - typedef typename - Impl::if_c< StaticViewTraits::is_managed && - Impl::is_same< Kokkos::HostSpace - , Kokkos::Impl::ActiveExecutionMemorySpace >::value - , PotentiallyManaged - , StaticallyUnmanaged - >::type StaticManagementTag ; - - enum { Unmanaged = 0x01 - , Noncontiguous = 0x02 - }; - - enum { DefaultTraits = Impl::is_same< StaticManagementTag , StaticallyUnmanaged >::value ? Unmanaged : 0 }; - - unsigned m_traits ; ///< Runtime traits - - - template< class T > - inline static - unsigned assign( const ViewDataManagement<T> & rhs , const PotentiallyManaged & ) - { return rhs.m_traits | ( rhs.is_managed() && Kokkos::HostSpace::in_parallel() ? 
unsigned(Unmanaged) : 0u ); } - - template< class T > - KOKKOS_INLINE_FUNCTION static - unsigned assign( const ViewDataManagement<T> & rhs , const StaticallyUnmanaged & ) - { return rhs.m_traits | Unmanaged ; } - -public: - - typedef typename ViewDataHandle< StaticViewTraits >::handle_type handle_type; - - KOKKOS_INLINE_FUNCTION - ViewDataManagement() : m_traits( DefaultTraits ) {} - - KOKKOS_INLINE_FUNCTION - ViewDataManagement( const ViewDataManagement & rhs ) - : m_traits( assign( rhs , StaticManagementTag() ) ) {} - - KOKKOS_INLINE_FUNCTION - ViewDataManagement & operator = ( const ViewDataManagement & rhs ) - { m_traits = assign( rhs , StaticManagementTag() ); return *this ; } - - template< class SVT > - KOKKOS_INLINE_FUNCTION - ViewDataManagement( const ViewDataManagement<SVT> & rhs ) - : m_traits( assign( rhs , StaticManagementTag() ) ) {} - - template< class SVT > - KOKKOS_INLINE_FUNCTION - ViewDataManagement & operator = ( const ViewDataManagement<SVT> & rhs ) - { m_traits = assign( rhs , StaticManagementTag() ); return *this ; } - - KOKKOS_INLINE_FUNCTION - bool is_managed() const { return ! ( m_traits & Unmanaged ); } - - KOKKOS_INLINE_FUNCTION - bool is_contiguous() const { return ! 
( m_traits & Noncontiguous ); } - - KOKKOS_INLINE_FUNCTION - void set_unmanaged() { m_traits |= Unmanaged ; } - - KOKKOS_INLINE_FUNCTION - void set_noncontiguous() { m_traits |= Noncontiguous ; } - - template< bool Initialize > - static - handle_type allocate( const std::string & label - , const Impl::ViewOffset< typename StaticViewTraits::shape_type, typename StaticViewTraits::array_layout > & offset_map - , AllocationTracker & tracker - ) - { - typedef typename StaticViewTraits::execution_space execution_space ; - typedef typename StaticViewTraits::memory_space memory_space ; - typedef typename StaticViewTraits::value_type value_type ; - - const size_t count = offset_map.capacity(); - - tracker = memory_space::allocate_and_track( label, sizeof(value_type) * count ); - - value_type * ptr = reinterpret_cast<value_type *>(tracker.alloc_ptr()); - - // Default construct within the view's execution space. - (void) ViewDefaultConstruct< execution_space , value_type , Initialize >( ptr , count ); - - return ViewDataHandle< StaticViewTraits >::create_handle(ptr, tracker); - } -}; - -#endif /* #if ! 
KOKKOS_USING_EXP_VIEW */ - } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp index 0fc3e22b9e..61d2e35702 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp @@ -46,164 +46,11 @@ #include <impl/KokkosExp_ViewTile.hpp> -#if KOKKOS_USING_EXP_VIEW - namespace Kokkos { using Kokkos::Experimental::tile_subview ; } -#else - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class T , unsigned N0 , unsigned N1 , class MemorySpace , class MemoryTraits > -struct ViewSpecialize< T , void , LayoutTileLeft<N0,N1> , MemorySpace , MemoryTraits > -{ - typedef ViewDefault type ; -}; - -struct ViewTile {}; - -template< class ShapeType , unsigned N0 , unsigned N1 > -struct ViewOffset< ShapeType - , LayoutTileLeft<N0,N1,true> /* Only accept properly shaped tiles */ - , typename Impl::enable_if<( 2 == ShapeType::rank - && - 2 == ShapeType::rank_dynamic - )>::type > - : public ShapeType -{ - enum { SHIFT_0 = Impl::integral_power_of_two(N0) }; - enum { SHIFT_1 = Impl::integral_power_of_two(N1) }; - enum { MASK_0 = N0 - 1 }; - enum { MASK_1 = N1 - 1 }; - - typedef size_t size_type ; - typedef ShapeType shape_type ; - typedef LayoutTileLeft<N0,N1,true> array_layout ; - - enum { has_padding = true }; - - size_type tile_N0 ; - - KOKKOS_INLINE_FUNCTION - void assign( const ViewOffset & rhs ) - { - shape_type::N0 = rhs.N0 ; - shape_type::N1 = rhs.N1 ; - tile_N0 = ( rhs.N0 + MASK_0 ) >> SHIFT_0 ; // number of tiles in first dimension - } - - KOKKOS_INLINE_FUNCTION - void assign( size_t n0 , size_t n1 - , int = 0 , int = 0 - , int = 0 , int = 0 - , int = 0 , int = 0 - , int = 0 - ) - { - shape_type::N0 = n0 ; - shape_type::N1 = n1 ; - tile_N0 = ( n0 + MASK_0 ) >> 
SHIFT_0 ; // number of tiles in first dimension - } - - - KOKKOS_INLINE_FUNCTION - void set_padding() {} - - - template< typename I0 , typename I1 > - KOKKOS_INLINE_FUNCTION - size_type operator()( I0 const & i0 , I1 const & i1 - , int = 0 , int = 0 - , int = 0 , int = 0 - , int = 0 , int = 0 - ) const - { - return /* ( ( Tile offset ) * ( Tile size ) ) */ - ( ( (i0>>SHIFT_0) + tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) ) + - /* ( Offset within tile ) */ - ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ; - } - - template< typename I0 , typename I1 > - KOKKOS_INLINE_FUNCTION - size_type tile_begin( I0 const & i_tile0 , I1 const & i_tile1 ) const - { - return ( i_tile0 + tile_N0 * i_tile1 ) << ( SHIFT_0 + SHIFT_1 ); - } - - - KOKKOS_INLINE_FUNCTION - size_type capacity() const - { - // ( TileDim0 * ( TileDim1 ) ) * TileSize - return ( tile_N0 * ( ( shape_type::N1 + MASK_1 ) >> SHIFT_1 ) ) << ( SHIFT_0 + SHIFT_1 ); - } -}; - -template<> -struct ViewAssignment< ViewTile , void , void > -{ - // Some compilers have type-matching issues on the integer values when using: - // template< class T , unsigned N0 , unsigned N1 , class A2 , class A3 > - template< class T , unsigned dN0 , unsigned dN1 - , class A2 , class A3 - , unsigned sN0 , unsigned sN1 > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View< T[dN0][dN1], LayoutLeft, A2, A3, Impl::ViewDefault > & dst - , View< T** , LayoutTileLeft<sN0,sN1,true>, A2, A3, Impl::ViewDefault > const & src - , size_t const i_tile0 - , typename Impl::enable_if< unsigned(dN0) == unsigned(sN0) && - unsigned(dN1) == unsigned(sN1) - , size_t const - >::type i_tile1 - ) - { - // Destination is always contiguous but source may be non-contiguous - // so don't assign the whole view management object. - // Just query and appropriately set the reference-count state. - - if ( ! 
src.m_management.is_managed() ) dst.m_management.set_unmanaged(); - - dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map.tile_begin(i_tile0,i_tile1); - - dst.m_tracker = src.m_tracker; - } -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -namespace Kokkos { - -template< class T , unsigned N0, unsigned N1, class A2, class A3 > -KOKKOS_INLINE_FUNCTION -View< T[N0][N1], LayoutLeft, A2, A3, Impl::ViewDefault > -tile_subview( const View<T**,LayoutTileLeft<N0,N1,true>,A2,A3,Impl::ViewDefault> & src - , const size_t i_tile0 - , const size_t i_tile1 - ) -{ - View< T[N0][N1], LayoutLeft, A2, A3, Impl::ViewDefault > dst ; - - (void) Impl::ViewAssignment< Impl::ViewTile , void , void >( dst , src , i_tile0 , i_tile1 ); - - return dst ; -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif - #endif /* #ifndef KOKKOS_VIEWTILELEFT_HPP */ diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt index e835245e25..5bb2b672e1 100644 --- a/lib/kokkos/core/unit_test/CMakeLists.txt +++ b/lib/kokkos/core/unit_test/CMakeLists.txt @@ -44,7 +44,7 @@ ENDIF() IF(Kokkos_ENABLE_OpenMP) TRIBITS_ADD_EXECUTABLE_AND_TEST( UnitTest_OpenMP - SOURCES UnitTestMain.cpp TestOpenMP.cpp + SOURCES UnitTestMain.cpp TestOpenMP.cpp TestOpenMP_a.cpp TestOpenMP_b.cpp TestOpenMP_c.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -66,7 +66,7 @@ ENDIF() IF(Kokkos_ENABLE_Cuda) TRIBITS_ADD_EXECUTABLE_AND_TEST( UnitTest_Cuda - SOURCES UnitTestMain.cpp TestCuda.cpp + SOURCES UnitTestMain.cpp TestCuda.cpp TestCuda_a.cpp TestCuda_b.cpp TestCuda_c.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -76,27 +76,30 @@ ENDIF() TRIBITS_ADD_EXECUTABLE_AND_TEST( UnitTest_Default - SOURCES UnitTestMain.cpp TestDefaultDeviceType.cpp TestDefaultDeviceTypeInit.cpp + SOURCES 
UnitTestMain.cpp TestDefaultDeviceType.cpp TestDefaultDeviceType_a.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " TESTONLYLIBS kokkos_gtest ) +foreach(INITTESTS_NUM RANGE 1 16) TRIBITS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HWLOC - SOURCES UnitTestMain.cpp TestHWLOC.cpp + UnitTest_DefaultInit_${INITTESTS_NUM} + SOURCES UnitTestMain.cpp TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " TESTONLYLIBS kokkos_gtest ) +endforeach(INITTESTS_NUM) TRIBITS_ADD_EXECUTABLE_AND_TEST( - UnitTest_AllocationTracker - SOURCES UnitTestMain.cpp TestAllocationTracker.cpp + UnitTest_HWLOC + SOURCES UnitTestMain.cpp TestHWLOC.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ) + diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile index 6e0f56a62c..3d9d212c1e 100644 --- a/lib/kokkos/core/unit_test/Makefile +++ b/lib/kokkos/core/unit_test/Makefile @@ -61,17 +61,16 @@ OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o TARGETS += KokkosCore_UnitTest_HWLOC TEST_TARGETS += test-hwloc -OBJ_ALLOCATIONTRACKER = TestAllocationTracker.o UnitTestMain.o gtest-all.o -TARGETS += KokkosCore_UnitTest_AllocationTracker -TEST_TARGETS += test-allocationtracker - -OBJ_DEFAULT = TestDefaultDeviceType.o UnitTestMain.o gtest-all.o +OBJ_DEFAULT = TestDefaultDeviceType.o TestDefaultDeviceType_a.o UnitTestMain.o gtest-all.o TARGETS += KokkosCore_UnitTest_Default TEST_TARGETS += test-default -OBJ_DEFAULTINIT = TestDefaultDeviceTypeInit.o UnitTestMain.o gtest-all.o -TARGETS += KokkosCore_UnitTest_DefaultInit -TEST_TARGETS += test-default-init +NUM_INITTESTS = 16 +INITTESTS_NUMBERS := $(shell seq 1 ${NUM_INITTESTS}) +INITTESTS_TARGETS := $(addprefix KokkosCore_UnitTest_DefaultDeviceTypeInit_,${INITTESTS_NUMBERS}) +TARGETS += ${INITTESTS_TARGETS} +INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS}) 
+TEST_TARGETS += ${INITTESTS_TEST_TARGETS} OBJ_SYNCHRONIC = TestSynchronic.o UnitTestMain.o gtest-all.o TARGETS += KokkosCore_UnitTest_Synchronic @@ -101,8 +100,8 @@ KokkosCore_UnitTest_AllocationTracker: $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LINK_DE KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Default -KokkosCore_UnitTest_DefaultInit: $(OBJ_DEFAULTINIT) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_DEFAULTINIT) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultInit +${INITTESTS_TARGETS}: KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultDeviceTypeInit_$* KokkosCore_UnitTest_Synchronic: $(OBJ_SYNCHRONIC) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SYNCHRONIC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Synchronic @@ -131,8 +130,8 @@ test-allocationtracker: KokkosCore_UnitTest_AllocationTracker test-default: KokkosCore_UnitTest_Default ./KokkosCore_UnitTest_Default -test-default-init: KokkosCore_UnitTest_DefaultInit - ./KokkosCore_UnitTest_DefaultInit +${INITTESTS_TEST_TARGETS}: test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_% + ./KokkosCore_UnitTest_DefaultDeviceTypeInit_$* test-synchronic: KokkosCore_UnitTest_Synchronic ./KokkosCore_UnitTest_Synchronic diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp index 1fbb4bf4b1..5388a60787 100644 --- a/lib/kokkos/core/unit_test/TestAggregate.hpp +++ b/lib/kokkos/core/unit_test/TestAggregate.hpp @@ -52,665 +52,6 @@ /*--------------------------------------------------------------------------*/ -#if ! 
KOKKOS_USING_EXP_VIEW - -namespace Test { - -struct EmbedArray {}; - -struct ArrayProxyContiguous {}; -struct ArrayProxyStrided {}; - -template< typename T , unsigned N = 0 , class Proxy = void > -struct Array ; - -template< typename T > -struct Array<T,0,ArrayProxyContiguous> -{ -public: - typedef T value_type ; - - enum { StaticLength = 0 }; - T * const value ; - const unsigned count ; - - KOKKOS_INLINE_FUNCTION - Array( T * v , unsigned n ) : value(v), count(n) {} - - template< class Proxy > - KOKKOS_INLINE_FUNCTION - Array & operator = ( const Array<T,0,Proxy> & rhs ) { return *this ; } -}; - -template< typename T , unsigned N > -struct Array<T,N,ArrayProxyContiguous> -{ -public: - typedef T value_type ; - - enum { StaticLength = N }; - T * const value ; - - KOKKOS_INLINE_FUNCTION - Array( T * v , unsigned ) : value(v) {} - - template< class Proxy > - KOKKOS_INLINE_FUNCTION - Array & operator = ( const Array<T,N,Proxy> & rhs ) { return *this ; } -}; - -template< typename T , unsigned N > -struct Array<T,N,ArrayProxyStrided> -{ -public: - typedef T value_type ; - - enum { StaticLength = N }; - T * const value ; - const unsigned stride ; - - KOKKOS_INLINE_FUNCTION - Array( T * v , unsigned , unsigned s ) : value(v), stride(s) {} - - template< class Proxy > - KOKKOS_INLINE_FUNCTION - Array & operator = ( const Array<T,N,Proxy> & rhs ) { return *this ; } -}; - -template< typename T > -struct Array<T,0,ArrayProxyStrided> -{ -public: - typedef T value_type ; - - enum { StaticLength = 0 }; - T * const value ; - const unsigned count ; - const unsigned stride ; - - KOKKOS_INLINE_FUNCTION - Array( T * v , unsigned n , unsigned s ) : value(v), count(n), stride(s) {} - - template< class Proxy > - KOKKOS_INLINE_FUNCTION - Array & operator = ( const Array<T,0,Proxy> & rhs ) { return *this ; } -}; - -template< typename T > -struct Array<T,0,void> -{ -public: - typedef T value_type ; - - enum { StaticLength = 0 }; - T * value ; - const unsigned count ; - - 
KOKKOS_INLINE_FUNCTION - Array() : value(0) , count(0) {} - - template< unsigned N , class Proxy > - KOKKOS_INLINE_FUNCTION - Array( const Array<T,N,Proxy> & rhs ) : value(rhs.value), count(N) {} -}; - -template< typename T , unsigned N > -struct Array<T,N,void> -{ -public: - typedef T value_type ; - - enum { StaticLength = N }; - T value[N] ; - - template< class Proxy > - KOKKOS_INLINE_FUNCTION - Array & operator = ( const Array<T,N,Proxy> & ) { return *this ; } -}; - -} // namespace Test - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -template< typename T , unsigned N > -struct AnalyzeShape< Test::Array< T , N > > - : public ShapeInsert< typename AnalyzeShape< T >::shape , N >::type -{ -private: - typedef AnalyzeShape< T > nested ; -public: - - typedef Test::EmbedArray specialize ; - - typedef typename ShapeInsert< typename nested::shape , N >::type shape ; - - typedef typename nested::array_intrinsic_type array_intrinsic_type[ N ]; - typedef Test::Array< T , N > value_type ; - typedef Test::Array< T , N > type ; - - typedef const array_intrinsic_type const_array_intrinsic_type ; - typedef const value_type const_value_type ; - typedef const type const_type ; - - typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type[ N ]; - typedef Test::Array< typename nested::non_const_value_type , N > non_const_value_type ; - typedef Test::Array< typename nested::non_const_value_type , N > non_const_type ; -}; - -template< typename T > -struct AnalyzeShape< Test::Array< T , 0 > > - : public ShapeInsert< typename AnalyzeShape< T >::shape , 0 >::type -{ -private: - typedef AnalyzeShape< T > nested ; -public: - - typedef Test::EmbedArray specialize ; - - typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; - - typedef typename nested::array_intrinsic_type * 
array_intrinsic_type ; - typedef Test::Array< T , 0 > value_type ; - typedef Test::Array< T , 0 > type ; - - typedef const array_intrinsic_type const_array_intrinsic_type ; - typedef const value_type const_value_type ; - typedef const type const_type ; - - typedef typename nested::non_const_array_intrinsic_type * non_const_array_intrinsic_type ; - typedef Test::Array< typename nested::non_const_value_type , 0 > non_const_value_type ; - typedef Test::Array< typename nested::non_const_value_type , 0 > non_const_type ; -}; - -/*--------------------------------------------------------------------------*/ - -template< class ValueType , class MemorySpace , class MemoryTraits > -struct ViewSpecialize< ValueType - , Test::EmbedArray - , LayoutLeft - , MemorySpace - , MemoryTraits > -{ typedef Test::EmbedArray type ; }; - -template< class ValueType , class MemorySpace , class MemoryTraits > -struct ViewSpecialize< ValueType - , Test::EmbedArray - , LayoutRight - , MemorySpace - , MemoryTraits > -{ typedef Test::EmbedArray type ; }; - -/*--------------------------------------------------------------------------*/ - -template<> -struct ViewAssignment< Test::EmbedArray , Test::EmbedArray , void > -{ - //------------------------------------ - /** \brief Compatible value and shape */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View<DT,DL,DD,DM,Test::EmbedArray> & dst - , const View<ST,SL,SD,SM,Test::EmbedArray> & src - , const typename enable_if<( - ViewAssignable< ViewTraits<DT,DL,DD,DM> , - ViewTraits<ST,SL,SD,SM> >::value - )>::type * = 0 - ) - { - dst.m_offset_map.assign( src.m_offset_map ); - - dst.m_ptr_on_device = src.m_ptr_on_device ; - - dst.m_tracker = src.m_tracker; - } -}; - -template<> -struct ViewAssignment< ViewDefault , Test::EmbedArray , void > -{ - //------------------------------------ - /** \brief Compatible value and shape */ - - template< class ST , class 
SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( typename View<ST,SL,SD,SM,Test::EmbedArray>::array_type & dst - , const View<ST,SL,SD,SM,Test::EmbedArray> & src - ) - { - dst.m_offset_map.assign( src.m_offset_map ); - - dst.m_ptr_on_device = src.m_ptr_on_device ; - - dst.m_tracker = src.m_tracker; - } -}; - - -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { - -template< class DataType , - class Arg1Type , - class Arg2Type , - class Arg3Type > -class View< DataType , Arg1Type , Arg2Type , Arg3Type , Test::EmbedArray > - : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > -{ -public: - - typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ; - -private: - - // Assignment of compatible views requirement: - template< class , class , class , class , class > friend class View ; - - // Assignment of compatible subview requirement: - template< class , class , class > friend struct Impl::ViewAssignment ; - - typedef Impl::ViewOffset< typename traits::shape_type , - typename traits::array_layout > offset_map_type ; - - typedef Impl::ViewDataManagement< traits > view_data_management ; - - // traits::value_type = Test::Array< T , N > - - typename traits::value_type::value_type * m_ptr_on_device ; - offset_map_type m_offset_map ; - view_data_management m_management ; - Impl::AllocationTracker m_tracker ; - -public: - - typedef View< typename traits::array_intrinsic_type , - typename traits::array_layout , - typename traits::execution_space , - typename traits::memory_traits > array_type ; - - typedef View< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::execution_space , - typename traits::memory_traits > non_const_type ; - - typedef View< typename traits::const_data_type , - typename traits::array_layout , - 
typename traits::execution_space , - typename traits::memory_traits > const_type ; - - typedef View< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::host_mirror_space , - void > HostMirror ; - - //------------------------------------ - // Shape - - enum { Rank = traits::rank - 1 }; - - KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_offset_map ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type size() const - { - return m_offset_map.N0 - * m_offset_map.N1 - * m_offset_map.N2 - * m_offset_map.N3 - * m_offset_map.N4 - * m_offset_map.N5 - * m_offset_map.N6 - * m_offset_map.N7 - ; - } - - template< typename iType > - KOKKOS_INLINE_FUNCTION - typename traits::size_type dimension( const iType & i ) const - { return Impl::dimension( m_offset_map , i ); } - - //------------------------------------ - // Destructor, constructors, assignment operators: - - KOKKOS_INLINE_FUNCTION - ~View() {} - - KOKKOS_INLINE_FUNCTION - View() - : m_ptr_on_device(0) - , m_offset_map() - , m_management() - , m_tracker() - { m_offset_map.assing(0,0,0,0,0,0,0,0); } - - KOKKOS_INLINE_FUNCTION - View( const View & rhs ) - : 
m_ptr_on_device(0) - , m_offset_map() - , m_management() - , m_tracker() - { - (void) Impl::ViewAssignment< - typename traits::specialize , - typename traits::specialize >( *this , rhs ); - } - - KOKKOS_INLINE_FUNCTION - View & operator = ( const View & rhs ) - { - (void) Impl::ViewAssignment< - typename traits::specialize , - typename traits::specialize >( *this , rhs ); - return *this ; - } - - //------------------------------------ - // Construct or assign compatible view: - - template< class RT , class RL , class RD , class RM , class RS > - KOKKOS_INLINE_FUNCTION - View( const View<RT,RL,RD,RM,RS> & rhs ) - : m_ptr_on_device(0) - , m_offset_map() - , m_management() - , m_tracker() - { - (void) Impl::ViewAssignment< - typename traits::specialize , RS >( *this , rhs ); - } - - template< class RT , class RL , class RD , class RM , class RS > - KOKKOS_INLINE_FUNCTION - View & operator = ( const View<RT,RL,RD,RM,RS> & rhs ) - { - (void) Impl::ViewAssignment< - typename traits::specialize , RS >( *this , rhs ); - return *this ; - } - - //------------------------------------ - // Allocation of a managed view with possible alignment padding. 
- - template< class AllocationProperties > - explicit inline - View( const AllocationProperties & prop , - const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) - : m_ptr_on_device(0) - , m_offset_map() - , m_management() - , m_tracker() - { - typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ; - - typedef typename traits::memory_space memory_space ; - typedef typename traits::value_type::value_type scalar_type ; - - m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); - m_offset_map.set_padding(); - - m_tracker = memory_space::allocate_and_track( Alloc::label( prop ), sizeof(scalar_type) * m_offset_map.capacity() ); - - m_ptr_on_device = reinterpret_cast<scalar_type *>(m_tracker.alloc_ptr()); - - (void) Impl::ViewDefaultConstruct< typename traits::execution_space , scalar_type , Alloc::Initialize >( m_ptr_on_device , m_offset_map.capacity() ); - } - - //------------------------------------ - // Assign an unmanaged View from pointer, can be called in functors. - // No alignment padding is performed. - - typedef Impl::if_c< ! 
traits::is_managed , - typename traits::value_type::value_type * , - Impl::ViewError::user_pointer_constructor_requires_unmanaged > - if_user_pointer_constructor ; - - View( typename if_user_pointer_constructor::type ptr , - const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) - : m_ptr_on_device(0) - , m_offset_map() - , m_management() - , m_tracker() - { - m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); - m_ptr_on_device = if_user_pointer_constructor::select( ptr ); - m_management.set_unmanaged(); - } - - //------------------------------------ - // Assign unmanaged View to portion of Device shared memory - - typedef Impl::if_c< ! traits::is_managed , - typename traits::execution_space , - Impl::ViewError::device_shmem_constructor_requires_unmanaged > - if_device_shmem_constructor ; - - explicit KOKKOS_INLINE_FUNCTION - View( typename if_device_shmem_constructor::type & dev , - const unsigned n0 = 0 , - const unsigned n1 = 0 , - const unsigned n2 = 0 , - const unsigned n3 = 0 , - const unsigned n4 = 0 , - const unsigned n5 = 0 , - const unsigned n6 = 0 , - const unsigned n7 = 0 ) - : m_ptr_on_device(0) - , m_offset_map() - , m_management() - , m_tracker() - { - typedef typename traits::value_type::value_type scalar_type ; - - enum { align = 8 }; - enum { mask = align - 1 }; - - typedef Impl::if_c< ! 
traits::is_managed , - scalar_type * , - Impl::ViewError::device_shmem_constructor_requires_unmanaged > - if_device_shmem_pointer ; - - m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); - - // Select the first argument: - m_ptr_on_device = if_device_shmem_pointer::select( - (scalar_type *) dev.get_shmem( unsigned( sizeof(scalar_type) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) ); - } - - static inline - unsigned shmem_size( const unsigned n0 = 0 , - const unsigned n1 = 0 , - const unsigned n2 = 0 , - const unsigned n3 = 0 , - const unsigned n4 = 0 , - const unsigned n5 = 0 , - const unsigned n6 = 0 , - const unsigned n7 = 0 ) - { - enum { align = 8 }; - enum { mask = align - 1 }; - - typedef typename traits::value_type::value_type scalar_type ; - - offset_map_type offset_map ; - - offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); - - return unsigned( sizeof(scalar_type) * offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ; - } - - //------------------------------------ - // Is not allocated - - KOKKOS_INLINE_FUNCTION - bool is_null() const { return 0 == m_ptr_on_device ; } - - //------------------------------------ - // LayoutLeft, rank 2: - - typedef Test::Array< typename traits::value_type::value_type , - traits::value_type::StaticLength , - Test::ArrayProxyStrided > LeftValue ; - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type - operator[] ( const iType0 & i0 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 ); - } - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type - operator() ( const iType0 & i0 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 
0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 ); - } - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type - at( const iType0 & i0 , const int , const int , const int , - const int , const int , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 ); - } - - //------------------------------------ - // LayoutRight, rank 2: - - typedef Test::Array< typename traits::value_type::value_type , - traits::value_type::StaticLength , - Test::ArrayProxyContiguous > RightValue ; - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type - operator[] ( const iType0 & i0 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 ); - } - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type - operator() ( const iType0 & i0 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 ); - } - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type - at( const iType0 & i0 , const int , const int , const int , - const int , const int , const int , const int ) const - { - 
KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 ); - } - - //------------------------------------ - // Access to the underlying contiguous storage of this view specialization. - // These methods are specific to specialization of a view. - - KOKKOS_INLINE_FUNCTION - typename traits::value_type::value_type * ptr_on_device() const { return m_ptr_on_device ; } - - // Stride of physical storage, dimensioned to at least Rank - template< typename iType > - KOKKOS_INLINE_FUNCTION - void stride( iType * const s ) const - { m_offset_map.stride( s ); } - - // Count of contiguously allocated data members including padding. - KOKKOS_INLINE_FUNCTION - typename traits::size_type capacity() const - { return m_offset_map.capacity(); } -}; - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -namespace Test { - -template< class DeviceType > -int TestViewAggregate() -{ - typedef Kokkos::View< Test::Array<double,32> * , DeviceType > a32_type ; - typedef typename a32_type::array_type a32_base_type ; - - typedef Kokkos::View< Test::Array<double> * , DeviceType > a0_type ; - typedef typename a0_type::array_type a0_base_type ; - - a32_type a32("a32",100); - a32_base_type a32_base ; - - a0_type a0("a0",100,32); - a0_base_type a0_base ; - - a32_base = a32 ; - a0_base = a0 ; - - - return 0 ; -} - -} - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -#else /* #if ! KOKKOS_USING_EXP_VIEW */ - #include <impl/KokkosExp_ViewArray.hpp> namespace Test { @@ -762,8 +103,6 @@ void TestViewAggregate() } -#endif /* #if ! 
KOKKOS_USING_EXP_VIEW */ - /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/unit_test/TestAggregateReduction.hpp b/lib/kokkos/core/unit_test/TestAggregateReduction.hpp index b0b3747087..bd05cd347b 100644 --- a/lib/kokkos/core/unit_test/TestAggregateReduction.hpp +++ b/lib/kokkos/core/unit_test/TestAggregateReduction.hpp @@ -57,12 +57,10 @@ struct StaticArray { T value[N] ; KOKKOS_INLINE_FUNCTION - StaticArray() - { for ( unsigned i = 0 ; i < N ; ++i ) value[i] = T(); } + StaticArray() = default; KOKKOS_INLINE_FUNCTION - StaticArray( const StaticArray & rhs ) - { for ( unsigned i = 0 ; i < N ; ++i ) value[i] = rhs.value[i]; } + StaticArray( const StaticArray & rhs ) = default; KOKKOS_INLINE_FUNCTION operator T () { return value[0]; } @@ -75,11 +73,7 @@ struct StaticArray { } KOKKOS_INLINE_FUNCTION - StaticArray & operator = ( const StaticArray & rhs ) - { - for ( unsigned i = 0 ; i < N ; ++i ) value[i] = rhs.value[i] ; - return *this ; - } + StaticArray & operator = ( const StaticArray & rhs ) = default; KOKKOS_INLINE_FUNCTION StaticArray operator * ( const StaticArray & rhs ) @@ -111,6 +105,8 @@ struct StaticArray { } }; +static_assert(std::is_trivial<StaticArray<int, 4>>::value, "Not trivial"); + template< typename T , class Space > struct DOT { typedef T value_type ; diff --git a/lib/kokkos/core/unit_test/TestAllocationTracker.cpp b/lib/kokkos/core/unit_test/TestAllocationTracker.cpp deleted file mode 100644 index b3a7fe9803..0000000000 --- a/lib/kokkos/core/unit_test/TestAllocationTracker.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include <gtest/gtest.h> - -#include <iostream> -#include <vector> - -#include <Kokkos_Core.hpp> - -#include <impl/Kokkos_AllocationTracker.hpp> -#include <impl/Kokkos_BasicAllocators.hpp> - -namespace Test { - -class alocation_tracker : public ::testing::Test { -protected: - static void SetUpTestCase() - { - Kokkos::initialize(); - } - - static void TearDownTestCase() - { - Kokkos::finalize(); - } -}; - -TEST_F( alocation_tracker, simple) -{ - -#if ! KOKKOS_USING_EXP_VIEW - - using namespace Kokkos::Impl; - - { - AllocationTracker tracker; - EXPECT_FALSE( tracker.is_valid() ); - } - - // test ref count and label - { - int size = 100; - std::vector<AllocationTracker> trackers(size); - - trackers[0] = AllocationTracker( MallocAllocator(), 128,"Test"); - - for (int i=0; i<size; ++i) { - trackers[i] = trackers[0]; - } - - EXPECT_EQ(100u, trackers[0].ref_count()); - EXPECT_EQ(std::string("Test"), std::string(trackers[0].label())); - } - - - // test circular list - { - int num_allocs = 3000; - unsigned ref_count = 100; - - std::vector<AllocationTracker> trackers(num_allocs); - - for (int i=0; i<num_allocs; ++i) { - trackers[i] = AllocationTracker( MallocAllocator(), 128, "Test"); - std::vector<AllocationTracker> ref_trackers(ref_count); - for (unsigned j=0; j<ref_count; ++j) { - ref_trackers[j] = trackers[i]; - } - EXPECT_EQ( ref_count + 1u, trackers[i].ref_count() ); - } - - for (int i=0; i<num_allocs; ++i) { - EXPECT_EQ( 1u, trackers[i].ref_count() ); - } - } - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - -} - -TEST_F( alocation_tracker, force_leaks) -{ -// uncomment to force memory leaks -#if 0 - using namespace Kokkos::Impl; - Kokkos::kokkos_malloc("Forced Leak", 4096*10); - Kokkos::kokkos_malloc<Kokkos::HostSpace>("Forced Leak", 4096*10); -#endif -} - -TEST_F( alocation_tracker, disable_reference_counting) -{ - -#if ! 
KOKKOS_USING_EXP_VIEW - - using namespace Kokkos::Impl; - // test ref count and label - { - int size = 100; - std::vector<AllocationTracker> trackers(size); - - trackers[0] = AllocationTracker( MallocAllocator(), 128,"Test"); - - for (int i=1; i<size; ++i) { - Kokkos::Impl::AllocationTracker::disable_tracking(); - trackers[i] = trackers[0] ; - Kokkos::Impl::AllocationTracker::enable_tracking(); - } - - EXPECT_EQ(1u, trackers[0].ref_count()); - EXPECT_EQ(std::string("Test"), std::string(trackers[0].label())); - } - -#endif /* #if ! KOKKOS_USING_EXP_VIEW */ - -} - -} // namespace Test diff --git a/lib/kokkos/core/unit_test/TestAtomic.hpp b/lib/kokkos/core/unit_test/TestAtomic.hpp index 7b3ab14c06..e948723574 100644 --- a/lib/kokkos/core/unit_test/TestAtomic.hpp +++ b/lib/kokkos/core/unit_test/TestAtomic.hpp @@ -84,10 +84,9 @@ struct SuperScalar { } KOKKOS_INLINE_FUNCTION - volatile SuperScalar& operator = (const SuperScalar& src) volatile { + void operator = (const SuperScalar& src) volatile { for(int i=0; i<N; i++) val[i] = src.val[i]; - return *this; } KOKKOS_INLINE_FUNCTION @@ -208,6 +207,10 @@ T AddLoopSerial(int loop) { return val; } +//------------------------------------------------------ +//--------------atomic_compare_exchange----------------- +//------------------------------------------------------ + template<class T,class DEVICE_TYPE> struct CASFunctor{ typedef DEVICE_TYPE execution_space; @@ -270,6 +273,10 @@ T CASLoopSerial(int loop) { return val; } +//---------------------------------------------- +//--------------atomic_exchange----------------- +//---------------------------------------------- + template<class T,class DEVICE_TYPE> struct ExchFunctor{ typedef DEVICE_TYPE execution_space; diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp new file mode 100644 index 0000000000..aee4bda06c --- /dev/null +++ b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp @@ -0,0 +1,841 @@ +/* 
+//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +namespace TestAtomicOperations { + +//----------------------------------------------- +//--------------zero_functor--------------------- +//----------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct ZeroFunctor { + typedef DEVICE_TYPE execution_space; + typedef typename Kokkos::View<T,execution_space> type; + typedef typename Kokkos::View<T,execution_space>::HostMirror h_type; + type data; + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + data() = 0; + } +}; + +//----------------------------------------------- +//--------------init_functor--------------------- +//----------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct InitFunctor { + typedef DEVICE_TYPE execution_space; + typedef typename Kokkos::View<T,execution_space> type; + typedef typename Kokkos::View<T,execution_space>::HostMirror h_type; + type data; + T init_value ; + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + data() = init_value; + } + + InitFunctor(T _init_value) : init_value(_init_value) {} +}; + + +//--------------------------------------------------- +//--------------atomic_fetch_max--------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct MaxFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + //Kokkos::atomic_fetch_max(&data(),(T)1); + Kokkos::atomic_fetch_max(&data(),(T)i1); + } + MaxFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {} +}; + +template<class T, class execution_space > +T MaxAtomic(T i0 , T i1) { + struct InitFunctor<T,execution_space> f_init(i0); + typename InitFunctor<T,execution_space>::type data("Data"); + typename 
InitFunctor<T,execution_space>::h_type h_data("HData"); + f_init.data = data; + Kokkos::parallel_for(1,f_init); + execution_space::fence(); + + struct MaxFunctor<T,execution_space> f(i0,i1); + f.data = data; + Kokkos::parallel_for(1,f); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T> +T MaxAtomicCheck(T i0 , T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = (i0 > i1 ? i0 : i1) ; + + T val = *data; + delete [] data; + return val; +} + +template<class T,class DeviceType> +bool MaxAtomicTest(T i0, T i1) +{ + T res = MaxAtomic<T,DeviceType>(i0,i1); + T resSerial = MaxAtomicCheck<T>(i0,i1); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + + std::cout << "Loop<" + << typeid(T).name() + << ">( test = MaxAtomicTest" + << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + return passed ; +} + +//--------------------------------------------------- +//--------------atomic_fetch_min--------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct MinFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + Kokkos::atomic_fetch_min(&data(),(T)i1); + } + MinFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {} +}; + +template<class T, class execution_space > +T MinAtomic(T i0 , T i1) { + struct InitFunctor<T,execution_space> f_init(i0); + typename InitFunctor<T,execution_space>::type data("Data"); + typename InitFunctor<T,execution_space>::h_type h_data("HData"); + f_init.data = data; + Kokkos::parallel_for(1,f_init); + execution_space::fence(); + + struct MinFunctor<T,execution_space> f(i0,i1); + f.data = data; + Kokkos::parallel_for(1,f); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T> +T MinAtomicCheck(T 
i0 , T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = (i0 < i1 ? i0 : i1) ; + + T val = *data; + delete [] data; + return val; +} + +template<class T,class DeviceType> +bool MinAtomicTest(T i0, T i1) +{ + T res = MinAtomic<T,DeviceType>(i0,i1); + T resSerial = MinAtomicCheck<T>(i0,i1); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + + std::cout << "Loop<" + << typeid(T).name() + << ">( test = MinAtomicTest" + << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + return passed ; +} + +//--------------------------------------------------- +//--------------atomic_fetch_mul--------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct MulFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + Kokkos::atomic_fetch_mul(&data(),(T)i1); + } + MulFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {} +}; + +template<class T, class execution_space > +T MulAtomic(T i0 , T i1) { + struct InitFunctor<T,execution_space> f_init(i0); + typename InitFunctor<T,execution_space>::type data("Data"); + typename InitFunctor<T,execution_space>::h_type h_data("HData"); + f_init.data = data; + Kokkos::parallel_for(1,f_init); + execution_space::fence(); + + struct MulFunctor<T,execution_space> f(i0,i1); + f.data = data; + Kokkos::parallel_for(1,f); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T> +T MulAtomicCheck(T i0 , T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0*i1 ; + + T val = *data; + delete [] data; + return val; +} + +template<class T,class DeviceType> +bool MulAtomicTest(T i0, T i1) +{ + T res = MulAtomic<T,DeviceType>(i0,i1); + T resSerial = MulAtomicCheck<T>(i0,i1); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + + std::cout << "Loop<" + 
<< typeid(T).name() + << ">( test = MulAtomicTest" + << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + return passed ; +} + +//--------------------------------------------------- +//--------------atomic_fetch_div--------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct DivFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + Kokkos::atomic_fetch_div(&data(),(T)i1); + } + DivFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {} +}; + +template<class T, class execution_space > +T DivAtomic(T i0 , T i1) { + struct InitFunctor<T,execution_space> f_init(i0); + typename InitFunctor<T,execution_space>::type data("Data"); + typename InitFunctor<T,execution_space>::h_type h_data("HData"); + f_init.data = data; + Kokkos::parallel_for(1,f_init); + execution_space::fence(); + + struct DivFunctor<T,execution_space> f(i0,i1); + f.data = data; + Kokkos::parallel_for(1,f); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T> +T DivAtomicCheck(T i0 , T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0/i1 ; + + T val = *data; + delete [] data; + return val; +} + +template<class T,class DeviceType> +bool DivAtomicTest(T i0, T i1) +{ + T res = DivAtomic<T,DeviceType>(i0,i1); + T resSerial = DivAtomicCheck<T>(i0,i1); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + + std::cout << "Loop<" + << typeid(T).name() + << ">( test = DivAtomicTest" + << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + return passed ; +} + +//--------------------------------------------------- +//--------------atomic_fetch_mod--------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct ModFunctor{ + typedef DEVICE_TYPE 
execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + Kokkos::atomic_fetch_mod(&data(),(T)i1); + } + ModFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {} +}; + +template<class T, class execution_space > +T ModAtomic(T i0 , T i1) { + struct InitFunctor<T,execution_space> f_init(i0); + typename InitFunctor<T,execution_space>::type data("Data"); + typename InitFunctor<T,execution_space>::h_type h_data("HData"); + f_init.data = data; + Kokkos::parallel_for(1,f_init); + execution_space::fence(); + + struct ModFunctor<T,execution_space> f(i0,i1); + f.data = data; + Kokkos::parallel_for(1,f); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T> +T ModAtomicCheck(T i0 , T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0%i1 ; + + T val = *data; + delete [] data; + return val; +} + +template<class T,class DeviceType> +bool ModAtomicTest(T i0, T i1) +{ + T res = ModAtomic<T,DeviceType>(i0,i1); + T resSerial = ModAtomicCheck<T>(i0,i1); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + + std::cout << "Loop<" + << typeid(T).name() + << ">( test = ModAtomicTest" + << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + return passed ; +} + +//--------------------------------------------------- +//--------------atomic_fetch_and--------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct AndFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + Kokkos::atomic_fetch_and(&data(),(T)i1); + } + AndFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {} +}; + +template<class T, class execution_space > +T AndAtomic(T i0 , T i1) { + struct InitFunctor<T,execution_space> f_init(i0); + typename 
InitFunctor<T,execution_space>::type data("Data"); + typename InitFunctor<T,execution_space>::h_type h_data("HData"); + f_init.data = data; + Kokkos::parallel_for(1,f_init); + execution_space::fence(); + + struct AndFunctor<T,execution_space> f(i0,i1); + f.data = data; + Kokkos::parallel_for(1,f); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T> +T AndAtomicCheck(T i0 , T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0&i1 ; + + T val = *data; + delete [] data; + return val; +} + +template<class T,class DeviceType> +bool AndAtomicTest(T i0, T i1) +{ + T res = AndAtomic<T,DeviceType>(i0,i1); + T resSerial = AndAtomicCheck<T>(i0,i1); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + + std::cout << "Loop<" + << typeid(T).name() + << ">( test = AndAtomicTest" + << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + return passed ; +} + +//--------------------------------------------------- +//--------------atomic_fetch_or---------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct OrFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + Kokkos::atomic_fetch_or(&data(),(T)i1); + } + OrFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {} +}; + +template<class T, class execution_space > +T OrAtomic(T i0 , T i1) { + struct InitFunctor<T,execution_space> f_init(i0); + typename InitFunctor<T,execution_space>::type data("Data"); + typename InitFunctor<T,execution_space>::h_type h_data("HData"); + f_init.data = data; + Kokkos::parallel_for(1,f_init); + execution_space::fence(); + + struct OrFunctor<T,execution_space> f(i0,i1); + f.data = data; + Kokkos::parallel_for(1,f); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; 
+} + +template<class T> +T OrAtomicCheck(T i0 , T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0|i1 ; + + T val = *data; + delete [] data; + return val; +} + +template<class T,class DeviceType> +bool OrAtomicTest(T i0, T i1) +{ + T res = OrAtomic<T,DeviceType>(i0,i1); + T resSerial = OrAtomicCheck<T>(i0,i1); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + + std::cout << "Loop<" + << typeid(T).name() + << ">( test = OrAtomicTest" + << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + return passed ; +} + +//--------------------------------------------------- +//--------------atomic_fetch_xor--------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct XorFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + Kokkos::atomic_fetch_xor(&data(),(T)i1); + } + XorFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {} +}; + +template<class T, class execution_space > +T XorAtomic(T i0 , T i1) { + struct InitFunctor<T,execution_space> f_init(i0); + typename InitFunctor<T,execution_space>::type data("Data"); + typename InitFunctor<T,execution_space>::h_type h_data("HData"); + f_init.data = data; + Kokkos::parallel_for(1,f_init); + execution_space::fence(); + + struct XorFunctor<T,execution_space> f(i0,i1); + f.data = data; + Kokkos::parallel_for(1,f); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T> +T XorAtomicCheck(T i0 , T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0^i1 ; + + T val = *data; + delete [] data; + return val; +} + +template<class T,class DeviceType> +bool XorAtomicTest(T i0, T i1) +{ + T res = XorAtomic<T,DeviceType>(i0,i1); + T resSerial = XorAtomicCheck<T>(i0,i1); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + 
+ std::cout << "Loop<" + << typeid(T).name() + << ">( test = XorAtomicTest" + << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + return passed ; +} + +//--------------------------------------------------- +//--------------atomic_fetch_lshift--------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct LShiftFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + Kokkos::atomic_fetch_lshift(&data(),(T)i1); + } + LShiftFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {} +}; + +template<class T, class execution_space > +T LShiftAtomic(T i0 , T i1) { + struct InitFunctor<T,execution_space> f_init(i0); + typename InitFunctor<T,execution_space>::type data("Data"); + typename InitFunctor<T,execution_space>::h_type h_data("HData"); + f_init.data = data; + Kokkos::parallel_for(1,f_init); + execution_space::fence(); + + struct LShiftFunctor<T,execution_space> f(i0,i1); + f.data = data; + Kokkos::parallel_for(1,f); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T> +T LShiftAtomicCheck(T i0 , T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0<<i1 ; + + T val = *data; + delete [] data; + return val; +} + +template<class T,class DeviceType> +bool LShiftAtomicTest(T i0, T i1) +{ + T res = LShiftAtomic<T,DeviceType>(i0,i1); + T resSerial = LShiftAtomicCheck<T>(i0,i1); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + + std::cout << "Loop<" + << typeid(T).name() + << ">( test = LShiftAtomicTest" + << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + return passed ; +} + +//--------------------------------------------------- +//--------------atomic_fetch_rshift--------------------- +//--------------------------------------------------- + +template<class 
T,class DEVICE_TYPE> +struct RShiftFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + Kokkos::atomic_fetch_rshift(&data(),(T)i1); + } + RShiftFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {} +}; + +template<class T, class execution_space > +T RShiftAtomic(T i0 , T i1) { + struct InitFunctor<T,execution_space> f_init(i0); + typename InitFunctor<T,execution_space>::type data("Data"); + typename InitFunctor<T,execution_space>::h_type h_data("HData"); + f_init.data = data; + Kokkos::parallel_for(1,f_init); + execution_space::fence(); + + struct RShiftFunctor<T,execution_space> f(i0,i1); + f.data = data; + Kokkos::parallel_for(1,f); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T> +T RShiftAtomicCheck(T i0 , T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0>>i1 ; + + T val = *data; + delete [] data; + return val; +} + +template<class T,class DeviceType> +bool RShiftAtomicTest(T i0, T i1) +{ + T res = RShiftAtomic<T,DeviceType>(i0,i1); + T resSerial = RShiftAtomicCheck<T>(i0,i1); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + + std::cout << "Loop<" + << typeid(T).name() + << ">( test = RShiftAtomicTest" + << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + return passed ; +} + + +//--------------------------------------------------- +//--------------atomic_test_control------------------ +//--------------------------------------------------- + +template<class T,class DeviceType> +bool AtomicOperationsTestIntegralType( int i0 , int i1 , int test ) +{ + switch (test) { + case 1: return MaxAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 2: return MinAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 3: return MulAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 4: return DivAtomicTest<T,DeviceType>( (T)i0 , (T)i1 
); + case 5: return ModAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 6: return AndAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 7: return OrAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 8: return XorAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 9: return LShiftAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 10: return RShiftAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + } + return 0; +} + +template<class T,class DeviceType> +bool AtomicOperationsTestNonIntegralType( int i0 , int i1 , int test ) +{ + switch (test) { + case 1: return MaxAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 2: return MinAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 3: return MulAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + case 4: return DivAtomicTest<T,DeviceType>( (T)i0 , (T)i1 ); + } + return 0; +} + +} // namespace + diff --git a/lib/kokkos/core/unit_test/TestCuda.cpp b/lib/kokkos/core/unit_test/TestCuda.cpp index 3958c1a344..e615566252 100644 --- a/lib/kokkos/core/unit_test/TestCuda.cpp +++ b/lib/kokkos/core/unit_test/TestCuda.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -81,24 +81,31 @@ #include <TestTaskPolicy.hpp> #include <TestPolicyConstruction.hpp> +#include <TestMDRange.hpp> + //---------------------------------------------------------------------------- class cuda : public ::testing::Test { protected: - static void SetUpTestCase() + static void SetUpTestCase(); + static void TearDownTestCase(); +}; + +void cuda::SetUpTestCase() { Kokkos::Cuda::print_configuration( std::cout ); Kokkos::HostSpace::execution_space::initialize(); Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) ); } - static void TearDownTestCase() + +void cuda::TearDownTestCase() { Kokkos::Cuda::finalize(); Kokkos::HostSpace::execution_space::finalize(); } -}; //---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- namespace Test { @@ -116,6 +123,11 @@ void test_cuda_spaces_int_value( int * ptr ) if ( *ptr == 42 ) { *ptr = 2 * 42 ; } } +TEST_F( cuda , md_range ) { + TestMDRange_2D< Kokkos::Cuda >::test_for2(100,100); + + TestMDRange_3D< Kokkos::Cuda >::test_for3(100,100,100); +} TEST_F( cuda , compiler_macros ) { @@ -223,7 +235,6 @@ struct TestViewCudaTexture { } }; - TEST_F( cuda , impl_view_texture ) { TestViewCudaTexture< Kokkos::CudaSpace >::run(); @@ -265,7 +276,6 @@ struct TestViewCudaAccessible { } }; - TEST_F( cuda , impl_view_accessible ) { TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run(); @@ -276,338 +286,5 @@ TEST_F( cuda , impl_view_accessible ) TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run(); TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run(); } -/* -//---------------------------------------------------------------------------- - -TEST_F( cuda, view_impl ) -{ - // test_abort<<<32,32>>>(); // Aborts the kernel with 
CUDA version 4.1 or greater - - test_view_impl< Kokkos::Cuda >(); -} - -TEST_F( cuda, view_api ) -{ - typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ; - typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | Kokkos::Unmanaged > > view_texture_unmanaged ; - - TestViewAPI< double , Kokkos::Cuda >(); - TestViewAPI< double , Kokkos::CudaUVMSpace >(); - -#if 0 - Kokkos::View<double, Kokkos::Cuda > x("x"); - Kokkos::View<double[1], Kokkos::Cuda > y("y"); - // *x = 10 ; - // x() = 10 ; - // y[0] = 10 ; - // y(0) = 10 ; -#endif -} - - -TEST_F( cuda , view_nested_view ) -{ - ::Test::view_nested_view< Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_auto_1d_left ) { - TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_auto_1d_right ) { - TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_auto_1d_stride ) { - TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_assign_strided ) { - TestViewSubview::test_1d_strided_assignment< Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_left_0 ) { - TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_left_1 ) { - TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_left_2 ) { - TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_left_3 ) { - TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_right_0 ) { - TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_right_1 ) { - TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_right_3 ) { - TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_1d_assign ) { - 
TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_2d_from_3d ) { - TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_2d_from_5d ) { - TestViewSubview::test_2d_subview_5d< Kokkos::CudaUVMSpace >(); -} - - -TEST_F( cuda, range_tag ) -{ - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001); - //TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000); -} - -TEST_F( cuda, team_tag ) -{ - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000); - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000); -} - -TEST_F( cuda, reduce ) -{ - TestReduce< long , Kokkos::Cuda >( 10000000 ); - TestReduce< double , Kokkos::Cuda >( 1000000 ); - TestReduce< int , Kokkos::Cuda >( 0 ); -} - -TEST_F( cuda, reduce_team ) -{ - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 ); - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 ); - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 ); - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 
); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 ); -} - -TEST_F( cuda, shared_team ) -{ - TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >(); - TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >(); -} - - -#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) -TEST_F( cuda, lambda_shared_team ) -{ - TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >(); - TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >(); - TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >(); - TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >(); - TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >(); - TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >(); -} -#endif - - -TEST_F( cuda, reduce_dynamic ) -{ - TestReduceDynamic< long , Kokkos::Cuda >( 10000000 ); - TestReduceDynamic< double , Kokkos::Cuda >( 1000000 ); -} - -TEST_F( cuda, reduce_dynamic_view ) -{ - TestReduceDynamicView< long , Kokkos::Cuda >( 10000000 ); - TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 ); -} - -TEST_F( cuda, atomic ) -{ - const int loop_count = 1e3 ; - - ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( 
TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,3) ) ); - -} - -//---------------------------------------------------------------------------- - -TEST_F( cuda, tile_layout) -{ - TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 ); - TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 ); - - TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 ); - TestTile::test< Kokkos::Cuda , 2 , 
2 >( 4 , 4 ); - TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 ); - - TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 ); - - TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 ); - - TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 ); - TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 ); - TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 ); -} - -TEST_F( cuda , view_aggregate ) -{ - TestViewAggregate< Kokkos::Cuda >(); - TestViewAggregateReduction< Kokkos::Cuda >(); -} - - -TEST_F( cuda , scan ) -{ - TestScan< Kokkos::Cuda >::test_range( 1 , 1000 ); - TestScan< Kokkos::Cuda >( 1000000 ); - TestScan< Kokkos::Cuda >( 10000000 ); - - TestScan< Kokkos::Cuda >( 0 ); - TestScan< Kokkos::Cuda >( 0 , 0 ); - - Kokkos::Cuda::fence(); -} - -TEST_F( cuda , team_scan ) -{ - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10 ); - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10 ); - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10000 ); - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 ); -} - -TEST_F( cuda , memory_pool ) -{ - bool val_uvm = TestMemoryPool::test_mempool< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 128000 ); - ASSERT_TRUE( val_uvm ); - - Kokkos::Cuda::fence(); - - TestMemoryPool::test_mempool2< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 2560000 ); - - Kokkos::Cuda::fence(); -} - -} - -//---------------------------------------------------------------------------- - -TEST_F( cuda , template_meta_functions ) -{ - TestTemplateMetaFunctions<int, Kokkos::Cuda >(); -} - -//---------------------------------------------------------------------------- - -namespace Test { - -TEST_F( cuda , reduction_deduction ) -{ - TestCXX11::test_reduction_deduction< 
Kokkos::Cuda >(); -} - -TEST_F( cuda , team_vector ) -{ - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) ); -} -*/ -} - -//---------------------------------------------------------------------------- -/* -#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) -TEST_F( cuda , task_policy ) -{ - TestTaskPolicy::test_task_dep< Kokkos::Cuda >( 10 ); - - for ( long i = 0 ; i < 15 ; ++i ) { - // printf("TestTaskPolicy::test_fib< Kokkos::Cuda >(%d);\n",i); - TestTaskPolicy::test_fib< Kokkos::Cuda >(i,4096); - } - for ( long i = 0 ; i < 35 ; ++i ) { - // printf("TestTaskPolicy::test_fib2< Kokkos::Cuda >(%d);\n",i); - TestTaskPolicy::test_fib2< Kokkos::Cuda >(i,4096); - } } - -TEST_F( cuda , task_team ) -{ - TestTaskPolicy::test_task_team< Kokkos::Cuda >(1000); -} - -TEST_F( cuda , task_latch ) -{ - TestTaskPolicy::test_latch< Kokkos::Cuda >(10); - TestTaskPolicy::test_latch< Kokkos::Cuda >(1000); -} - -#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */ - diff --git a/lib/kokkos/core/unit_test/TestCuda_a.cpp b/lib/kokkos/core/unit_test/TestCuda_a.cpp index 05716153d1..4680c33386 100644 --- a/lib/kokkos/core/unit_test/TestCuda_a.cpp +++ b/lib/kokkos/core/unit_test/TestCuda_a.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 
2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -85,199 +85,13 @@ class cuda : public ::testing::Test { protected: - static void SetUpTestCase() - { - Kokkos::Cuda::print_configuration( std::cout ); - Kokkos::HostSpace::execution_space::initialize(); - Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) ); - } - static void TearDownTestCase() - { - Kokkos::Cuda::finalize(); - Kokkos::HostSpace::execution_space::finalize(); - } + static void SetUpTestCase(); + static void TearDownTestCase(); }; //---------------------------------------------------------------------------- namespace Test { -/* -__global__ -void test_abort() -{ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - Kokkos::CudaSpace , - Kokkos::HostSpace >::verify(); -} - -__global__ -void test_cuda_spaces_int_value( int * ptr ) -{ - if ( *ptr == 42 ) { *ptr = 2 * 42 ; } -} - - -TEST_F( cuda , compiler_macros ) -{ - ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) ); -} - -TEST_F( cuda , memory_space ) -{ - TestMemorySpace< Kokkos::Cuda >(); -} - -TEST_F( cuda, uvm ) -{ - if ( Kokkos::CudaUVMSpace::available() ) { - - int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >("uvm_ptr",sizeof(int)); - - *uvm_ptr = 42 ; - - Kokkos::Cuda::fence(); - test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr); - Kokkos::Cuda::fence(); - - EXPECT_EQ( *uvm_ptr, int(2*42) ); - - Kokkos::kokkos_free< Kokkos::CudaUVMSpace >(uvm_ptr ); - } -} - 
-//---------------------------------------------------------------------------- - -TEST_F( cuda , impl_shared_alloc ) -{ - test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >(); - test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >(); - test_shared_alloc< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >(); -} - -TEST_F( cuda, policy_construction) { - TestRangePolicyConstruction< Kokkos::Cuda >(); - TestTeamPolicyConstruction< Kokkos::Cuda >(); -} - -TEST_F( cuda , impl_view_mapping ) -{ - test_view_mapping< Kokkos::Cuda >(); - test_view_mapping< Kokkos::CudaUVMSpace >(); - test_view_mapping_subview< Kokkos::Cuda >(); - test_view_mapping_subview< Kokkos::CudaUVMSpace >(); - test_view_mapping_operator< Kokkos::Cuda >(); - test_view_mapping_operator< Kokkos::CudaUVMSpace >(); - TestViewMappingAtomic< Kokkos::Cuda >::run(); -} - -TEST_F( cuda , view_of_class ) -{ - TestViewMappingClassValue< Kokkos::CudaSpace >::run(); - TestViewMappingClassValue< Kokkos::CudaUVMSpace >::run(); -} - -template< class MemSpace > -struct TestViewCudaTexture { - - enum { N = 1000 }; - - using V = Kokkos::Experimental::View<double*,MemSpace> ; - using T = Kokkos::Experimental::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ; - - V m_base ; - T m_tex ; - - struct TagInit {}; - struct TagTest {}; - - KOKKOS_INLINE_FUNCTION - void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; } - - KOKKOS_INLINE_FUNCTION - void operator()( const TagTest & , const int i , long & error_count ) const - { if ( m_tex[i] != i + 1 ) ++error_count ; } - - TestViewCudaTexture() - : m_base("base",N) - , m_tex( m_base ) - {} - - static void run() - { - EXPECT_TRUE( ( std::is_same< typename V::reference_type - , double & - >::value ) ); - - EXPECT_TRUE( ( std::is_same< typename T::reference_type - , const double - >::value ) ); - - EXPECT_TRUE( V::reference_type_is_lvalue_reference ); // An ordinary view - 
EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value - - TestViewCudaTexture self ; - Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self ); - long error_count = -1 ; - Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , error_count ); - EXPECT_EQ( error_count , 0 ); - } -}; - - -TEST_F( cuda , impl_view_texture ) -{ - TestViewCudaTexture< Kokkos::CudaSpace >::run(); - TestViewCudaTexture< Kokkos::CudaUVMSpace >::run(); -} - -template< class MemSpace , class ExecSpace > -struct TestViewCudaAccessible { - - enum { N = 1000 }; - - using V = Kokkos::Experimental::View<double*,MemSpace> ; - - V m_base ; - - struct TagInit {}; - struct TagTest {}; - - KOKKOS_INLINE_FUNCTION - void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; } - - KOKKOS_INLINE_FUNCTION - void operator()( const TagTest & , const int i , long & error_count ) const - { if ( m_base[i] != i + 1 ) ++error_count ; } - - TestViewCudaAccessible() - : m_base("base",N) - {} - - static void run() - { - TestViewCudaAccessible self ; - Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self ); - MemSpace::execution_space::fence(); - // Next access is a different execution space, must complete prior kernel. 
- long error_count = -1 ; - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count ); - EXPECT_EQ( error_count , 0 ); - } -}; - - -TEST_F( cuda , impl_view_accessible ) -{ - TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run(); - - TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run(); - TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run(); - - TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run(); - TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run(); -} -*/ -//---------------------------------------------------------------------------- TEST_F( cuda, view_impl ) { @@ -304,7 +118,6 @@ TEST_F( cuda, view_api ) #endif } - TEST_F( cuda , view_nested_view ) { ::Test::view_nested_view< Kokkos::Cuda >(); @@ -366,248 +179,4 @@ TEST_F( cuda, view_subview_2d_from_5d ) { TestViewSubview::test_2d_subview_5d< Kokkos::CudaUVMSpace >(); } -/* -TEST_F( cuda, range_tag ) -{ - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001); - //TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000); -} - -TEST_F( cuda, team_tag ) -{ - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000); - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> 
>::test_reduce(1000); -} - -TEST_F( cuda, reduce ) -{ - TestReduce< long , Kokkos::Cuda >( 10000000 ); - TestReduce< double , Kokkos::Cuda >( 1000000 ); - TestReduce< int , Kokkos::Cuda >( 0 ); -} - -TEST_F( cuda, reduce_team ) -{ - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 ); - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 ); - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 ); - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 ); } - -TEST_F( cuda, shared_team ) -{ - TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >(); - TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >(); -} - - -#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) -TEST_F( cuda, lambda_shared_team ) -{ - TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >(); - TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >(); - TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >(); - TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >(); - TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >(); - TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >(); -} -#endif - - -TEST_F( cuda, reduce_dynamic ) -{ - TestReduceDynamic< long , Kokkos::Cuda >( 10000000 ); - TestReduceDynamic< double , Kokkos::Cuda >( 1000000 ); -} - -TEST_F( cuda, reduce_dynamic_view ) -{ - 
TestReduceDynamicView< long , Kokkos::Cuda >( 10000000 ); - TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 ); -} - -TEST_F( cuda, atomic ) -{ - const int loop_count = 1e3 ; - - ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,2) ) ); - ASSERT_TRUE( ( 
TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,3) ) ); - -} - -//---------------------------------------------------------------------------- - -TEST_F( cuda, tile_layout) -{ - TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 ); - TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 ); - - TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 ); - TestTile::test< Kokkos::Cuda , 2 , 2 >( 4 , 4 ); - TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 ); - - TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 ); - - TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 ); - - TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 ); - TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 ); - TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 ); -} - -TEST_F( cuda , view_aggregate ) -{ - TestViewAggregate< Kokkos::Cuda >(); - TestViewAggregateReduction< Kokkos::Cuda >(); -} - - -TEST_F( cuda , scan ) -{ - TestScan< Kokkos::Cuda >::test_range( 1 , 1000 ); - TestScan< Kokkos::Cuda >( 1000000 ); - TestScan< Kokkos::Cuda >( 10000000 ); - - TestScan< Kokkos::Cuda >( 0 ); - TestScan< Kokkos::Cuda >( 0 , 0 ); - - Kokkos::Cuda::fence(); -} - -TEST_F( cuda , team_scan ) -{ - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10 ); - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10 ); - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10000 ); - TestScanTeam< 
Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 ); -} - -TEST_F( cuda , memory_pool ) -{ - bool val_uvm = TestMemoryPool::test_mempool< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 128000 ); - ASSERT_TRUE( val_uvm ); - - Kokkos::Cuda::fence(); - - TestMemoryPool::test_mempool2< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 2560000 ); - - Kokkos::Cuda::fence(); -} - -} - -//---------------------------------------------------------------------------- - -TEST_F( cuda , template_meta_functions ) -{ - TestTemplateMetaFunctions<int, Kokkos::Cuda >(); -} - -//---------------------------------------------------------------------------- - -namespace Test { - -TEST_F( cuda , reduction_deduction ) -{ - TestCXX11::test_reduction_deduction< Kokkos::Cuda >(); -} - -TEST_F( cuda , team_vector ) -{ - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) ); -} -*/ -} - -//---------------------------------------------------------------------------- -/* -#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) - -TEST_F( cuda , task_policy ) -{ - TestTaskPolicy::test_task_dep< Kokkos::Cuda >( 10 ); - - for ( long i = 0 ; i < 15 ; ++i ) { - // printf("TestTaskPolicy::test_fib< Kokkos::Cuda >(%d);\n",i); - TestTaskPolicy::test_fib< Kokkos::Cuda >(i,4096); - } - for ( long i = 0 ; i < 35 ; ++i ) { - // printf("TestTaskPolicy::test_fib2< Kokkos::Cuda >(%d);\n",i); - 
TestTaskPolicy::test_fib2< Kokkos::Cuda >(i,4096); - } -} - -TEST_F( cuda , task_team ) -{ - TestTaskPolicy::test_task_team< Kokkos::Cuda >(1000); -} - -TEST_F( cuda , task_latch ) -{ - TestTaskPolicy::test_latch< Kokkos::Cuda >(10); - TestTaskPolicy::test_latch< Kokkos::Cuda >(1000); -} - -#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */ - diff --git a/lib/kokkos/core/unit_test/TestCuda_b.cpp b/lib/kokkos/core/unit_test/TestCuda_b.cpp index 3d57347bb8..d4ca949e57 100644 --- a/lib/kokkos/core/unit_test/TestCuda_b.cpp +++ b/lib/kokkos/core/unit_test/TestCuda_b.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -85,290 +85,22 @@ class cuda : public ::testing::Test { protected: - static void SetUpTestCase() - { - Kokkos::Cuda::print_configuration( std::cout ); - Kokkos::HostSpace::execution_space::initialize(); - Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) ); - } - static void TearDownTestCase() - { - Kokkos::Cuda::finalize(); - Kokkos::HostSpace::execution_space::finalize(); - } + static void SetUpTestCase(); + static void TearDownTestCase(); }; //---------------------------------------------------------------------------- namespace Test { -/* -__global__ -void test_abort() -{ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - Kokkos::CudaSpace , - Kokkos::HostSpace >::verify(); -} - -__global__ -void test_cuda_spaces_int_value( int * ptr ) -{ - if ( *ptr == 42 ) { *ptr = 2 * 42 ; } -} - - -TEST_F( cuda , compiler_macros ) -{ - ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) ); -} - -TEST_F( cuda , memory_space ) -{ - TestMemorySpace< Kokkos::Cuda >(); -} - -TEST_F( cuda, uvm ) -{ - if ( Kokkos::CudaUVMSpace::available() ) { - - int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >("uvm_ptr",sizeof(int)); - - *uvm_ptr = 42 ; - - Kokkos::Cuda::fence(); - test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr); - Kokkos::Cuda::fence(); - - EXPECT_EQ( *uvm_ptr, int(2*42) ); - - Kokkos::kokkos_free< Kokkos::CudaUVMSpace >(uvm_ptr ); - } -} - -//---------------------------------------------------------------------------- - -TEST_F( cuda , impl_shared_alloc ) -{ - test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >(); - test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >(); - test_shared_alloc< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >(); -} - -TEST_F( cuda, policy_construction) { - TestRangePolicyConstruction< Kokkos::Cuda >(); - 
TestTeamPolicyConstruction< Kokkos::Cuda >(); -} - -TEST_F( cuda , impl_view_mapping ) -{ - test_view_mapping< Kokkos::Cuda >(); - test_view_mapping< Kokkos::CudaUVMSpace >(); - test_view_mapping_subview< Kokkos::Cuda >(); - test_view_mapping_subview< Kokkos::CudaUVMSpace >(); - test_view_mapping_operator< Kokkos::Cuda >(); - test_view_mapping_operator< Kokkos::CudaUVMSpace >(); - TestViewMappingAtomic< Kokkos::Cuda >::run(); -} - -TEST_F( cuda , view_of_class ) -{ - TestViewMappingClassValue< Kokkos::CudaSpace >::run(); - TestViewMappingClassValue< Kokkos::CudaUVMSpace >::run(); -} - -template< class MemSpace > -struct TestViewCudaTexture { - - enum { N = 1000 }; - - using V = Kokkos::Experimental::View<double*,MemSpace> ; - using T = Kokkos::Experimental::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ; - - V m_base ; - T m_tex ; - - struct TagInit {}; - struct TagTest {}; - - KOKKOS_INLINE_FUNCTION - void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; } - - KOKKOS_INLINE_FUNCTION - void operator()( const TagTest & , const int i , long & error_count ) const - { if ( m_tex[i] != i + 1 ) ++error_count ; } - - TestViewCudaTexture() - : m_base("base",N) - , m_tex( m_base ) - {} - - static void run() - { - EXPECT_TRUE( ( std::is_same< typename V::reference_type - , double & - >::value ) ); - - EXPECT_TRUE( ( std::is_same< typename T::reference_type - , const double - >::value ) ); - - EXPECT_TRUE( V::reference_type_is_lvalue_reference ); // An ordinary view - EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value - - TestViewCudaTexture self ; - Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self ); - long error_count = -1 ; - Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , error_count ); - EXPECT_EQ( error_count , 0 ); - } -}; - - -TEST_F( cuda , impl_view_texture ) -{ - TestViewCudaTexture< Kokkos::CudaSpace >::run(); - 
TestViewCudaTexture< Kokkos::CudaUVMSpace >::run(); -} - -template< class MemSpace , class ExecSpace > -struct TestViewCudaAccessible { - - enum { N = 1000 }; - - using V = Kokkos::Experimental::View<double*,MemSpace> ; - - V m_base ; - - struct TagInit {}; - struct TagTest {}; - - KOKKOS_INLINE_FUNCTION - void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; } - - KOKKOS_INLINE_FUNCTION - void operator()( const TagTest & , const int i , long & error_count ) const - { if ( m_base[i] != i + 1 ) ++error_count ; } - - TestViewCudaAccessible() - : m_base("base",N) - {} - - static void run() - { - TestViewCudaAccessible self ; - Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self ); - MemSpace::execution_space::fence(); - // Next access is a different execution space, must complete prior kernel. - long error_count = -1 ; - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count ); - EXPECT_EQ( error_count , 0 ); - } -}; - - -TEST_F( cuda , impl_view_accessible ) -{ - TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run(); - - TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run(); - TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run(); - - TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run(); - TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run(); -} - -//---------------------------------------------------------------------------- - -TEST_F( cuda, view_impl ) -{ - // test_abort<<<32,32>>>(); // Aborts the kernel with CUDA version 4.1 or greater - - test_view_impl< Kokkos::Cuda >(); -} - -TEST_F( cuda, view_api ) -{ - typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ; - typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | 
Kokkos::Unmanaged > > view_texture_unmanaged ; - - TestViewAPI< double , Kokkos::Cuda >(); - TestViewAPI< double , Kokkos::CudaUVMSpace >(); - -#if 0 - Kokkos::View<double, Kokkos::Cuda > x("x"); - Kokkos::View<double[1], Kokkos::Cuda > y("y"); - // *x = 10 ; - // x() = 10 ; - // y[0] = 10 ; - // y(0) = 10 ; -#endif -} - - -TEST_F( cuda , view_nested_view ) -{ - ::Test::view_nested_view< Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_auto_1d_left ) { - TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_auto_1d_right ) { - TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_auto_1d_stride ) { - TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_assign_strided ) { - TestViewSubview::test_1d_strided_assignment< Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_left_0 ) { - TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_left_1 ) { - TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_left_2 ) { - TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_left_3 ) { - TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_right_0 ) { - TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_right_1 ) { - TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_right_3 ) { - TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_1d_assign ) { - TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_2d_from_3d ) { - TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_2d_from_5d ) { - TestViewSubview::test_2d_subview_5d< Kokkos::CudaUVMSpace >(); -} -*/ TEST_F( cuda, range_tag ) { + TestRange< 
Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(3); + TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(3); + TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(3); + TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3); + TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3); + TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3); TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000); @@ -380,6 +112,10 @@ TEST_F( cuda, range_tag ) TEST_F( cuda, team_tag ) { + TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(3); + TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(3); + TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3); + TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3); TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000); @@ -393,6 +129,14 @@ TEST_F( cuda, reduce ) TestReduce< int , Kokkos::Cuda >( 0 ); } +TEST_F( cuda , reducers ) +{ + TestReducers<int, Kokkos::Cuda>::execute_integer(); + TestReducers<size_t, Kokkos::Cuda>::execute_integer(); + TestReducers<double, Kokkos::Cuda>::execute_float(); + TestReducers<Kokkos::complex<double>, Kokkos::Cuda>::execute_basic(); +} + TEST_F( cuda, reduce_team ) { TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 ); @@ -411,7 +155,6 @@ TEST_F( cuda, shared_team ) TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >(); } - #if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) 
TEST_F( cuda, lambda_shared_team ) { @@ -424,6 +167,14 @@ TEST_F( cuda, lambda_shared_team ) } #endif +TEST_F( cuda, shmem_size) { + TestShmemSize< Kokkos::Cuda >(); +} + +TEST_F( cuda, multi_level_scratch) { + TestMultiLevelScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >(); + TestMultiLevelScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >(); +} TEST_F( cuda, reduce_dynamic ) { @@ -436,178 +187,5 @@ TEST_F( cuda, reduce_dynamic_view ) TestReduceDynamicView< long , Kokkos::Cuda >( 10000000 ); TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 ); } -/* -TEST_F( cuda, atomic ) -{ - const int loop_count = 1e3 ; - - ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) ); - 
ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,3) ) ); - - ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,1) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,2) ) ); - ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,3) ) ); - -} - -//---------------------------------------------------------------------------- - -TEST_F( cuda, tile_layout) -{ - TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 ); - TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 ); - - TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 ); - TestTile::test< Kokkos::Cuda , 2 , 2 >( 4 , 4 ); - TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 ); - TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 ); - - TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 ); - TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 ); - - TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 ); - TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 ); - TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 ); - TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 ); -} - -TEST_F( cuda , view_aggregate ) -{ - TestViewAggregate< Kokkos::Cuda >(); - TestViewAggregateReduction< Kokkos::Cuda >(); } - - -TEST_F( cuda , scan ) -{ - TestScan< Kokkos::Cuda >::test_range( 1 , 1000 
); - TestScan< Kokkos::Cuda >( 1000000 ); - TestScan< Kokkos::Cuda >( 10000000 ); - - TestScan< Kokkos::Cuda >( 0 ); - TestScan< Kokkos::Cuda >( 0 , 0 ); - - Kokkos::Cuda::fence(); -} - -TEST_F( cuda , team_scan ) -{ - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10 ); - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10 ); - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10000 ); - TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 ); -} - -TEST_F( cuda , memory_pool ) -{ - bool val_uvm = TestMemoryPool::test_mempool< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 128000 ); - ASSERT_TRUE( val_uvm ); - - Kokkos::Cuda::fence(); - - TestMemoryPool::test_mempool2< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 2560000 ); - - Kokkos::Cuda::fence(); -} - -} - -//---------------------------------------------------------------------------- - -TEST_F( cuda , template_meta_functions ) -{ - TestTemplateMetaFunctions<int, Kokkos::Cuda >(); -} - -//---------------------------------------------------------------------------- - -namespace Test { - -TEST_F( cuda , reduction_deduction ) -{ - TestCXX11::test_reduction_deduction< Kokkos::Cuda >(); -} - -TEST_F( cuda , team_vector ) -{ - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) ); - ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) ); -} -*/ -} - 
-//---------------------------------------------------------------------------- -/* -#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) - -TEST_F( cuda , task_policy ) -{ - TestTaskPolicy::test_task_dep< Kokkos::Cuda >( 10 ); - - for ( long i = 0 ; i < 15 ; ++i ) { - // printf("TestTaskPolicy::test_fib< Kokkos::Cuda >(%d);\n",i); - TestTaskPolicy::test_fib< Kokkos::Cuda >(i,4096); - } - for ( long i = 0 ; i < 35 ; ++i ) { - // printf("TestTaskPolicy::test_fib2< Kokkos::Cuda >(%d);\n",i); - TestTaskPolicy::test_fib2< Kokkos::Cuda >(i,4096); - } -} - -TEST_F( cuda , task_team ) -{ - TestTaskPolicy::test_task_team< Kokkos::Cuda >(1000); -} - -TEST_F( cuda , task_latch ) -{ - TestTaskPolicy::test_latch< Kokkos::Cuda >(10); - TestTaskPolicy::test_latch< Kokkos::Cuda >(1000); -} - -#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */ - diff --git a/lib/kokkos/core/unit_test/TestCuda_c.cpp b/lib/kokkos/core/unit_test/TestCuda_c.cpp index 24635959c5..70584cead1 100644 --- a/lib/kokkos/core/unit_test/TestCuda_c.cpp +++ b/lib/kokkos/core/unit_test/TestCuda_c.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -60,6 +60,7 @@ #include <TestViewImpl.hpp> #include <TestAtomic.hpp> +#include <TestAtomicOperations.hpp> #include <TestViewAPI.hpp> #include <TestViewSubview.hpp> @@ -85,358 +86,14 @@ class cuda : public ::testing::Test { protected: - static void SetUpTestCase() - { - Kokkos::Cuda::print_configuration( std::cout ); - Kokkos::HostSpace::execution_space::initialize(); - Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) ); - } - static void TearDownTestCase() - { - Kokkos::Cuda::finalize(); - Kokkos::HostSpace::execution_space::finalize(); - } + static void SetUpTestCase(); + static void TearDownTestCase(); }; //---------------------------------------------------------------------------- namespace Test { -/* -__global__ -void test_abort() -{ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - Kokkos::CudaSpace , - Kokkos::HostSpace >::verify(); -} - -__global__ -void test_cuda_spaces_int_value( int * ptr ) -{ - if ( *ptr == 42 ) { *ptr = 2 * 42 ; } -} - - -TEST_F( cuda , compiler_macros ) -{ - ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) ); -} - -TEST_F( cuda , memory_space ) -{ - TestMemorySpace< Kokkos::Cuda >(); -} - -TEST_F( cuda, uvm ) -{ - if ( Kokkos::CudaUVMSpace::available() ) { - - int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >("uvm_ptr",sizeof(int)); - - *uvm_ptr = 42 ; - - Kokkos::Cuda::fence(); - test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr); - Kokkos::Cuda::fence(); - - EXPECT_EQ( *uvm_ptr, int(2*42) ); - - Kokkos::kokkos_free< Kokkos::CudaUVMSpace >(uvm_ptr ); - } -} - -//---------------------------------------------------------------------------- - -TEST_F( cuda , impl_shared_alloc ) -{ - test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >(); - test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >(); - test_shared_alloc< 
Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >(); -} - -TEST_F( cuda, policy_construction) { - TestRangePolicyConstruction< Kokkos::Cuda >(); - TestTeamPolicyConstruction< Kokkos::Cuda >(); -} - -TEST_F( cuda , impl_view_mapping ) -{ - test_view_mapping< Kokkos::Cuda >(); - test_view_mapping< Kokkos::CudaUVMSpace >(); - test_view_mapping_subview< Kokkos::Cuda >(); - test_view_mapping_subview< Kokkos::CudaUVMSpace >(); - test_view_mapping_operator< Kokkos::Cuda >(); - test_view_mapping_operator< Kokkos::CudaUVMSpace >(); - TestViewMappingAtomic< Kokkos::Cuda >::run(); -} - -TEST_F( cuda , view_of_class ) -{ - TestViewMappingClassValue< Kokkos::CudaSpace >::run(); - TestViewMappingClassValue< Kokkos::CudaUVMSpace >::run(); -} - -template< class MemSpace > -struct TestViewCudaTexture { - enum { N = 1000 }; - - using V = Kokkos::Experimental::View<double*,MemSpace> ; - using T = Kokkos::Experimental::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ; - - V m_base ; - T m_tex ; - - struct TagInit {}; - struct TagTest {}; - - KOKKOS_INLINE_FUNCTION - void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; } - - KOKKOS_INLINE_FUNCTION - void operator()( const TagTest & , const int i , long & error_count ) const - { if ( m_tex[i] != i + 1 ) ++error_count ; } - - TestViewCudaTexture() - : m_base("base",N) - , m_tex( m_base ) - {} - - static void run() - { - EXPECT_TRUE( ( std::is_same< typename V::reference_type - , double & - >::value ) ); - - EXPECT_TRUE( ( std::is_same< typename T::reference_type - , const double - >::value ) ); - - EXPECT_TRUE( V::reference_type_is_lvalue_reference ); // An ordinary view - EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value - - TestViewCudaTexture self ; - Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self ); - long error_count = -1 ; - Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , 
error_count ); - EXPECT_EQ( error_count , 0 ); - } -}; - - -TEST_F( cuda , impl_view_texture ) -{ - TestViewCudaTexture< Kokkos::CudaSpace >::run(); - TestViewCudaTexture< Kokkos::CudaUVMSpace >::run(); -} - -template< class MemSpace , class ExecSpace > -struct TestViewCudaAccessible { - - enum { N = 1000 }; - - using V = Kokkos::Experimental::View<double*,MemSpace> ; - - V m_base ; - - struct TagInit {}; - struct TagTest {}; - - KOKKOS_INLINE_FUNCTION - void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; } - - KOKKOS_INLINE_FUNCTION - void operator()( const TagTest & , const int i , long & error_count ) const - { if ( m_base[i] != i + 1 ) ++error_count ; } - - TestViewCudaAccessible() - : m_base("base",N) - {} - - static void run() - { - TestViewCudaAccessible self ; - Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self ); - MemSpace::execution_space::fence(); - // Next access is a different execution space, must complete prior kernel. 
- long error_count = -1 ; - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count ); - EXPECT_EQ( error_count , 0 ); - } -}; - - -TEST_F( cuda , impl_view_accessible ) -{ - TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run(); - - TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run(); - TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run(); - - TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run(); - TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run(); -} - -//---------------------------------------------------------------------------- - -TEST_F( cuda, view_impl ) -{ - // test_abort<<<32,32>>>(); // Aborts the kernel with CUDA version 4.1 or greater - - test_view_impl< Kokkos::Cuda >(); -} - -TEST_F( cuda, view_api ) -{ - typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ; - typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | Kokkos::Unmanaged > > view_texture_unmanaged ; - - TestViewAPI< double , Kokkos::Cuda >(); - TestViewAPI< double , Kokkos::CudaUVMSpace >(); - -#if 0 - Kokkos::View<double, Kokkos::Cuda > x("x"); - Kokkos::View<double[1], Kokkos::Cuda > y("y"); - // *x = 10 ; - // x() = 10 ; - // y[0] = 10 ; - // y(0) = 10 ; -#endif -} - - -TEST_F( cuda , view_nested_view ) -{ - ::Test::view_nested_view< Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_auto_1d_left ) { - TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_auto_1d_right ) { - TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_auto_1d_stride ) { - TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_assign_strided ) { - TestViewSubview::test_1d_strided_assignment< 
Kokkos::Cuda >(); -} - -TEST_F( cuda, view_subview_left_0 ) { - TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_left_1 ) { - TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_left_2 ) { - TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_left_3 ) { - TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_right_0 ) { - TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_right_1 ) { - TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_right_3 ) { - TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_1d_assign ) { - TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_2d_from_3d ) { - TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace >(); -} - -TEST_F( cuda, view_subview_2d_from_5d ) { - TestViewSubview::test_2d_subview_5d< Kokkos::CudaUVMSpace >(); -} - - -TEST_F( cuda, range_tag ) -{ - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001); - TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001); - //TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000); -} - -TEST_F( cuda, team_tag ) -{ - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); - TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000); - 
TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000); -} - -TEST_F( cuda, reduce ) -{ - TestReduce< long , Kokkos::Cuda >( 10000000 ); - TestReduce< double , Kokkos::Cuda >( 1000000 ); - TestReduce< int , Kokkos::Cuda >( 0 ); -} - -TEST_F( cuda, reduce_team ) -{ - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 ); - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 ); - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 ); - TestReduceTeam< long , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 ); - TestReduceTeam< double , Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 ); -} - -TEST_F( cuda, shared_team ) -{ - TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >(); - TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >(); -} - - -#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) -TEST_F( cuda, lambda_shared_team ) -{ - TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >(); - TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >(); - TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >(); - TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >(); - TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >(); - TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >(); -} -#endif - - -TEST_F( cuda, reduce_dynamic ) -{ - TestReduceDynamic< long , Kokkos::Cuda >( 10000000 ); - TestReduceDynamic< double , 
Kokkos::Cuda >( 1000000 ); -} - -TEST_F( cuda, reduce_dynamic_view ) -{ - TestReduceDynamicView< long , Kokkos::Cuda >( 10000000 ); - TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 ); -} -*/ TEST_F( cuda, atomic ) { const int loop_count = 1e3 ; @@ -479,6 +136,75 @@ TEST_F( cuda, atomic ) } +TEST_F( cuda , atomic_operations ) +{ + const int start = 1; //Avoid zero for division + const int end = 11; + for (int i = start; i < end; ++i) + { + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( 
TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long 
int,Kokkos::Cuda>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( 
TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 4 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 4 ) ) ); + } + +} + //---------------------------------------------------------------------------- TEST_F( cuda, tile_layout) @@ -512,7 +238,6 @@ TEST_F( cuda , view_aggregate ) TestViewAggregateReduction< Kokkos::Cuda >(); } - TEST_F( cuda , scan ) { TestScan< Kokkos::Cuda >::test_range( 1 , 1000 ); @@ -535,12 +260,19 @@ TEST_F( cuda , team_scan ) TEST_F( cuda , memory_pool ) { - bool val_uvm = TestMemoryPool::test_mempool< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 128000 ); - ASSERT_TRUE( val_uvm ); +// typedef Kokkos::CudaUVMSpace device_type; + typedef Kokkos::Cuda device_type; + + bool val = TestMemoryPool::test_mempool< device_type >( 128, 128000000 ); + ASSERT_TRUE( val ); Kokkos::Cuda::fence(); - TestMemoryPool::test_mempool2< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 2560000 ); + TestMemoryPool::test_mempool2< device_type >( 64, 4, 100000, 200000 ); + + Kokkos::Cuda::fence(); + + TestMemoryPool::test_memory_exhaustion< Kokkos::Cuda >(); Kokkos::Cuda::fence(); } @@ -578,13 +310,43 @@ TEST_F( cuda , team_vector ) ASSERT_TRUE( ( TestTeamVector::Test< 
Kokkos::Cuda >(10) ) ); } +TEST_F( cuda, triple_nested_parallelism ) +{ + TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 32 , 32 ); + TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 32 , 16 ); + TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 16 , 16 ); +} + } //---------------------------------------------------------------------------- -#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) +#if defined( KOKKOS_ENABLE_TASKPOLICY ) -TEST_F( cuda , task_policy ) +TEST_F( cuda , task_fib ) +{ + for ( int i = 0 ; i < 25 ; ++i ) { + TestTaskPolicy::TestFib< Kokkos::Cuda >::run(i, (i+1)*1000000 ); + } +} + +TEST_F( cuda , task_depend ) +{ + for ( int i = 0 ; i < 25 ; ++i ) { + TestTaskPolicy::TestTaskDependence< Kokkos::Cuda >::run(i); + } +} + +TEST_F( cuda , task_team ) +{ + //TestTaskPolicy::TestTaskTeam< Kokkos::Cuda >::run(1000); + TestTaskPolicy::TestTaskTeam< Kokkos::Cuda >::run(104); + TestTaskPolicy::TestTaskTeamValue< Kokkos::Cuda >::run(1000); +} + +//---------------------------------------------------------------------------- + +TEST_F( cuda , old_task_policy ) { TestTaskPolicy::test_task_dep< Kokkos::Cuda >( 10 ); @@ -598,16 +360,16 @@ TEST_F( cuda , task_policy ) } } -TEST_F( cuda , task_team ) +TEST_F( cuda , old_task_team ) { TestTaskPolicy::test_task_team< Kokkos::Cuda >(1000); } -TEST_F( cuda , task_latch ) +TEST_F( cuda , old_task_latch ) { TestTaskPolicy::test_latch< Kokkos::Cuda >(10); TestTaskPolicy::test_latch< Kokkos::Cuda >(1000); } -#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */ +#endif // #if defined( KOKKOS_ENABLE_TASKPOLICY ) diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp similarity index 93% rename from lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp rename to lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp index a1e3f8fb0a..a17ed97a9f 100644 --- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp 
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp @@ -257,11 +257,13 @@ protected: } }; - +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 TEST_F( defaultdevicetypeinit, no_args) { Impl::test_no_arguments(); } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 TEST_F( defaultdevicetypeinit, commandline_args_empty) { Kokkos::InitArguments argstruct; int nargs = 0; @@ -271,7 +273,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_empty) { delete [] args[i]; delete [] args; } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 TEST_F( defaultdevicetypeinit, commandline_args_other) { Kokkos::InitArguments argstruct; int nargs = 0; @@ -281,7 +285,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_other) { delete [] args[i]; delete [] args; } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 TEST_F( defaultdevicetypeinit, commandline_args_nthreads) { Kokkos::InitArguments argstruct; int nargs = 0; @@ -291,7 +297,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_nthreads) { delete [] args[i]; delete [] args; } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa) { Kokkos::InitArguments argstruct; int nargs = 0; @@ -301,7 +309,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa) { delete [] args[i]; delete [] args; } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device) { Kokkos::InitArguments argstruct; int nargs = 0; @@ -311,7 +321,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device) { delete [] args[i]; delete [] args; } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device) { Kokkos::InitArguments argstruct; int nargs = 0; @@ -321,7 +333,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device) { delete [] args[i]; delete [] args; } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 TEST_F( 
defaultdevicetypeinit, commandline_args_numa_device) { Kokkos::InitArguments argstruct; int nargs = 0; @@ -331,7 +345,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_numa_device) { delete [] args[i]; delete [] args; } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 TEST_F( defaultdevicetypeinit, commandline_args_device) { Kokkos::InitArguments argstruct; int nargs = 0; @@ -341,7 +357,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_device) { delete [] args[i]; delete [] args; } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) { Kokkos::InitArguments argstruct; int nargs = 0; @@ -351,38 +369,49 @@ TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) { delete [] args[i]; delete [] args; } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 TEST_F( defaultdevicetypeinit, initstruct_default) { Kokkos::InitArguments args; Impl::test_initstruct_args(args); } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 TEST_F( defaultdevicetypeinit, initstruct_nthreads) { Kokkos::InitArguments args = Impl::init_initstruct(true,false,false); Impl::test_initstruct_args(args); } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa) { Kokkos::InitArguments args = Impl::init_initstruct(true,true,false); Impl::test_initstruct_args(args); } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 TEST_F( defaultdevicetypeinit, initstruct_device) { Kokkos::InitArguments args = Impl::init_initstruct(false,false,true); Impl::test_initstruct_args(args); } +#endif +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 TEST_F( defaultdevicetypeinit, initstruct_nthreads_device) { Kokkos::InitArguments args = Impl::init_initstruct(true,false,true); Impl::test_initstruct_args(args); } +#endif - +#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa_device) { Kokkos::InitArguments args 
= Impl::init_initstruct(true,true,true); Impl::test_initstruct_args(args); } - +#endif } // namespace test diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp new file mode 100644 index 0000000000..40a773b3b8 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp new file mode 100644 index 0000000000..f12c4f62b2 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp new file mode 100644 index 0000000000..c7ffd7b94e --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp new file mode 100644 index 0000000000..24e2b15201 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp new file mode 100644 index 0000000000..7968c13b66 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 +#include<TestDefaultDeviceTypeInit.hpp> diff --git 
a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp new file mode 100644 index 0000000000..ab0563c6dc --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp new file mode 100644 index 0000000000..70a8ca1727 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp new file mode 100644 index 0000000000..727c7a95eb --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp new file mode 100644 index 0000000000..88fba34c50 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp new file mode 100644 index 0000000000..b3562cc53d --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp new file mode 100644 index 0000000000..0d4983319c --- 
/dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp new file mode 100644 index 0000000000..026fb01f88 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp new file mode 100644 index 0000000000..937a13160e --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp new file mode 100644 index 0000000000..992c854c1a --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp new file mode 100644 index 0000000000..07a8b1cb7c --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 +#include<TestDefaultDeviceTypeInit.hpp> diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp new file mode 100644 index 0000000000..4d8c05be2d --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 +#include<TestDefaultDeviceTypeInit.hpp> diff 
--git a/lib/kokkos/core/src/impl/Kokkos_MemoryPool.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp similarity index 77% rename from lib/kokkos/core/src/impl/Kokkos_MemoryPool.cpp rename to lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp index 70f0545b2c..c15f812233 100644 --- a/lib/kokkos/core/src/impl/Kokkos_MemoryPool.cpp +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,9 +36,41 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ -#include<impl/Kokkos_MemoryPool_Inline.hpp> +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_HAVE_CUDA) || defined(__CUDACC__) +//---------------------------------------------------------------------------- + +#include <TestReduce.hpp> + + +namespace Test { + +class defaultdevicetype : public ::testing::Test { +protected: + static void SetUpTestCase() + { + Kokkos::initialize(); + } + + static void TearDownTestCase() + { + Kokkos::finalize(); + } +}; + + +TEST_F( defaultdevicetype, reduce_instantiation) { + TestReduceCombinatoricalInstantiation<>::execute(); +} + +} // namespace test + +#endif diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp new file mode 100644 index 0000000000..9894d1ce69 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -0,0 +1,555 @@ +/* 
+//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdio.h> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Test { +namespace { + +template <typename ExecSpace > +struct TestMDRange_2D { + + using DataType = int ; + using ViewType = typename Kokkos::View< DataType** , ExecSpace > ; + using HostViewType = typename ViewType::HostMirror ; + + ViewType input_view ; + + TestMDRange_2D( const DataType N0, const DataType N1 ) : input_view("input_view", N0, N1) {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int i , const int j ) const + { + input_view(i,j) = 1; + } + + + static void test_for2( const int64_t N0, const int64_t N1 ) + { + + using namespace Kokkos::Experimental; + + { + using range_type = MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> >; + range_type range( {0,0}, {N0,N1} ); + TestMDRange_2D functor(N0,N1); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + if ( h_view(i,j) != 1 ) { + ++counter; + } + }} + if ( counter != 0 ) + printf(" Errors in test_for2; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> >; + + range_type range( {0,0}, {N0,N1} ); + TestMDRange_2D functor(N0,N1); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + if ( h_view(i,j) != 1 ) { + ++counter; + } + }} + if ( counter != 0 ) + printf(" Errors in 
test_for2; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Flat >, Kokkos::IndexType<int> >; + + range_type range( {0,0}, {N0,N1} ); + TestMDRange_2D functor(N0,N1); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + if ( h_view(i,j) != 1 ) { + ++counter; + } + }} + if ( counter != 0 ) + printf(" Errors in test_for2; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Flat >, Kokkos::IndexType<int> >; + + range_type range( {0,0}, {N0,N1} ); + TestMDRange_2D functor(N0,N1); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + if ( h_view(i,j) != 1 ) { + ++counter; + } + }} + if ( counter != 0 ) + printf(" Errors in test_for2; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Flat >, Kokkos::IndexType<int> >; + + range_type range( {0,0}, {N0,N1} ); + TestMDRange_2D functor(N0,N1); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + if ( h_view(i,j) != 1 ) { + ++counter; + } + }} + if ( counter != 0 ) + printf(" Errors in test_for2; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left , Iterate::Left >, 
Kokkos::IndexType<int> >; + + range_type range( {0,0}, {N0,N1}, {3,3} ); + TestMDRange_2D functor(N0,N1); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + if ( h_view(i,j) != 1 ) { + ++counter; + } + }} + if ( counter != 0 ) + printf(" Errors in test_for2; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left , Iterate::Right >, Kokkos::IndexType<int> >; + + range_type range( {0,0}, {N0,N1}, {7,7} ); + TestMDRange_2D functor(N0,N1); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + if ( h_view(i,j) != 1 ) { + ++counter; + } + }} + if ( counter != 0 ) + printf(" Errors in test_for2; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Left >, Kokkos::IndexType<int> >; + + range_type range( {0,0}, {N0,N1}, {16,16} ); + TestMDRange_2D functor(N0,N1); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + if ( h_view(i,j) != 1 ) { + ++counter; + } + }} + if ( counter != 0 ) + printf(" Errors in test_for2; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Right >, Kokkos::IndexType<int> >; + + range_type range( {0,0}, {N0,N1}, {5,16} ); + TestMDRange_2D functor(N0,N1); + + md_parallel_for( range, functor 
); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + if ( h_view(i,j) != 1 ) { + ++counter; + } + }} + if ( counter != 0 ) + printf(" Errors in test_for2; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + } //end test_for2 +}; //MDRange_2D + +template <typename ExecSpace > +struct TestMDRange_3D { + + using DataType = int ; + using ViewType = typename Kokkos::View< DataType*** , ExecSpace > ; + using HostViewType = typename ViewType::HostMirror ; + + ViewType input_view ; + + TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view("input_view", N0, N1, N2) {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int i , const int j , const int k ) const + { + input_view(i,j,k) = 1; + } + + static void test_for3( const int64_t N0, const int64_t N1, const int64_t N2 ) + { + using namespace Kokkos::Experimental; + + { + using range_type = MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> >; + + range_type range( {0,0,0}, {N0,N1,N2} ); + TestMDRange_3D functor(N0,N1,N2); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + for ( int k=0; k<N2; ++k ) { + if ( h_view(i,j,k) != 1 ) { + ++counter; + } + }}} + if ( counter != 0 ) + printf(" Errors in test_for3; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> >; + + range_type range( {0,0,0}, {N0,N1,N2} ); + TestMDRange_3D functor(N0,N1,N2); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view 
, functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + for ( int k=0; k<N2; ++k ) { + if ( h_view(i,j,k) != 1 ) { + ++counter; + } + }}} + if ( counter != 0 ) + printf(" Errors in test_for3; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Flat, Iterate::Default>, Kokkos::IndexType<int> >; + + range_type range( {0,0,0}, {N0,N1,N2} ); + TestMDRange_3D functor(N0,N1,N2); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + for ( int k=0; k<N2; ++k ) { + if ( h_view(i,j,k) != 1 ) { + ++counter; + } + }}} + if ( counter != 0 ) + printf(" Errors in test_for3; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Flat, Iterate::Flat >, Kokkos::IndexType<int> >; + + range_type range( {0,0,0}, {N0,N1,N2} ); + TestMDRange_3D functor(N0,N1,N2); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + for ( int k=0; k<N2; ++k ) { + if ( h_view(i,j,k) != 1 ) { + ++counter; + } + }}} + if ( counter != 0 ) + printf(" Errors in test_for3; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Flat >, Kokkos::IndexType<int> >; + + range_type range( {0,0,0}, {N0,N1,N2} ); + TestMDRange_3D functor(N0,N1,N2); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; 
+ for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + for ( int k=0; k<N2; ++k ) { + if ( h_view(i,j,k) != 1 ) { + ++counter; + } + }}} + if ( counter != 0 ) + printf(" Errors in test_for3; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Flat >, Kokkos::IndexType<int> >; + + range_type range( {0,0,0}, {N0,N1,N2} ); + TestMDRange_3D functor(N0,N1,N2); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + for ( int k=0; k<N2; ++k ) { + if ( h_view(i,j,k) != 1 ) { + ++counter; + } + }}} + if ( counter != 0 ) + printf(" Errors in test_for3; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Left >, Kokkos::IndexType<int> >; + + range_type range( {0,0,0}, {N0,N1,N2}, {2,4,2} ); + TestMDRange_3D functor(N0,N1,N2); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + for ( int k=0; k<N2; ++k ) { + if ( h_view(i,j,k) != 1 ) { + ++counter; + } + }}} + if ( counter != 0 ) + printf(" Errors in test_for3; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Right >, Kokkos::IndexType<int> >; + + range_type range( {0,0,0}, {N0,N1,N2}, {3,5,7} ); + TestMDRange_3D functor(N0,N1,N2); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i 
) { + for ( int j=0; j<N1; ++j ) { + for ( int k=0; k<N2; ++k ) { + if ( h_view(i,j,k) != 1 ) { + ++counter; + } + }}} + if ( counter != 0 ) + printf(" Errors in test_for3; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Left >, Kokkos::IndexType<int> >; + + range_type range( {0,0,0}, {N0,N1,N2}, {8,8,8} ); + TestMDRange_3D functor(N0,N1,N2); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + for ( int k=0; k<N2; ++k ) { + if ( h_view(i,j,k) != 1 ) { + ++counter; + } + }}} + if ( counter != 0 ) + printf(" Errors in test_for3; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + { + using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Right >, Kokkos::IndexType<int> >; + + range_type range( {0,0,0}, {N0,N1,N2}, {2,4,2} ); + TestMDRange_3D functor(N0,N1,N2); + + md_parallel_for( range, functor ); + + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view , functor.input_view ); + + int counter = 0; + for ( int i=0; i<N0; ++i ) { + for ( int j=0; j<N1; ++j ) { + for ( int k=0; k<N2; ++k ) { + if ( h_view(i,j,k) != 1 ) { + ++counter; + } + }}} + if ( counter != 0 ) + printf(" Errors in test_for3; mismatches = %d\n\n",counter); + ASSERT_EQ( counter , 0 ); + } + + } //end test_for3 +}; + +} /* namespace */ +} /* namespace Test */ + +/*--------------------------------------------------------------------------*/ + diff --git a/lib/kokkos/core/unit_test/TestMemoryPool.hpp b/lib/kokkos/core/unit_test/TestMemoryPool.hpp index b49d91e6a8..cf650b0bc8 100644 --- a/lib/kokkos/core/unit_test/TestMemoryPool.hpp +++ b/lib/kokkos/core/unit_test/TestMemoryPool.hpp @@ -48,37 +48,45 @@ #include <stdio.h> 
#include <iostream> #include <cmath> +#include <algorithm> #include <impl/Kokkos_Timer.hpp> //#define TESTMEMORYPOOL_PRINT //#define TESTMEMORYPOOL_PRINT_STATUS +#ifdef KOKKOS_HAVE_CUDA +#define STRIDE 32 +#else +#define STRIDE 1 +#endif + namespace TestMemoryPool { struct pointer_obj { - uint64_t * ptr; + uint64_t * ptr; +}; + +struct pointer_obj2 { + void * ptr; + size_t size; }; -template < typename PointerView, typename MemorySpace > +template < typename PointerView, typename Allocator > struct allocate_memory { typedef typename PointerView::execution_space execution_space; typedef typename execution_space::size_type size_type; - enum { STRIDE = 32 }; + PointerView m_pointers; + size_t m_chunk_size; + Allocator m_mempool; - PointerView m_pointers; - size_t m_num_ptrs; - size_t m_chunk_size; - MemorySpace m_space; - - allocate_memory( PointerView & ptrs, size_t nptrs, - size_t cs, MemorySpace & sp ) - : m_pointers( ptrs ), m_num_ptrs( nptrs ), - m_chunk_size( cs ), m_space( sp ) + allocate_memory( PointerView & ptrs, size_t num_ptrs, + size_t cs, Allocator & m ) + : m_pointers( ptrs ), m_chunk_size( cs ), m_mempool( m ) { // Initialize the view with the out degree of each vertex. 
- Kokkos::parallel_for( m_num_ptrs * STRIDE , *this ); + Kokkos::parallel_for( num_ptrs * STRIDE, *this ); } KOKKOS_INLINE_FUNCTION @@ -86,26 +94,55 @@ struct allocate_memory { { if ( i % STRIDE == 0 ) { m_pointers[i / STRIDE].ptr = - static_cast< uint64_t * >( m_space.allocate( m_chunk_size ) ); + static_cast< uint64_t * >( m_mempool.allocate( m_chunk_size ) ); } } }; template < typename PointerView > -struct fill_memory { +struct count_invalid_memory { typedef typename PointerView::execution_space execution_space; typedef typename execution_space::size_type size_type; + typedef uint64_t value_type; + + PointerView m_pointers; + uint64_t & m_result; + + count_invalid_memory( PointerView & ptrs, size_t num_ptrs, uint64_t & res ) + : m_pointers( ptrs ), m_result( res ) + { + // Initialize the view with the out degree of each vertex. + Kokkos::parallel_reduce( num_ptrs * STRIDE, *this, m_result ); + } + + KOKKOS_INLINE_FUNCTION + void init( value_type & v ) const + { v = 0; } + + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & dst, volatile value_type const & src ) const + { dst += src; } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type i, value_type & r ) const + { + if ( i % STRIDE == 0 ) { + r += ( m_pointers[i / STRIDE].ptr == 0 ); + } + } +}; - enum { STRIDE = 32 }; +template < typename PointerView > +struct fill_memory { + typedef typename PointerView::execution_space execution_space; + typedef typename execution_space::size_type size_type; PointerView m_pointers; - size_t m_num_ptrs; - fill_memory( PointerView & ptrs, size_t nptrs ) - : m_pointers( ptrs ), m_num_ptrs( nptrs ) + fill_memory( PointerView & ptrs, size_t num_ptrs ) : m_pointers( ptrs ) { // Initialize the view with the out degree of each vertex. 
- Kokkos::parallel_for( m_num_ptrs * STRIDE , *this ); + Kokkos::parallel_for( num_ptrs * STRIDE, *this ); } KOKKOS_INLINE_FUNCTION @@ -123,17 +160,14 @@ struct sum_memory { typedef typename execution_space::size_type size_type; typedef uint64_t value_type; - enum { STRIDE = 32 }; + PointerView m_pointers; + uint64_t & m_result; - PointerView m_pointers; - size_t m_num_ptrs; - uint64_t & result; - - sum_memory( PointerView & ptrs, size_t nptrs, uint64_t & res ) - : m_pointers( ptrs ), m_num_ptrs( nptrs ), result( res ) + sum_memory( PointerView & ptrs, size_t num_ptrs, uint64_t & res ) + : m_pointers( ptrs ), m_result( res ) { // Initialize the view with the out degree of each vertex. - Kokkos::parallel_reduce( m_num_ptrs * STRIDE , *this, result ); + Kokkos::parallel_reduce( num_ptrs * STRIDE, *this, m_result ); } KOKKOS_INLINE_FUNCTION @@ -153,65 +187,70 @@ struct sum_memory { } }; -template < typename PointerView, typename MemorySpace > +template < typename PointerView, typename Allocator > struct deallocate_memory { typedef typename PointerView::execution_space execution_space; typedef typename execution_space::size_type size_type; - enum { STRIDE = 32 }; + PointerView m_pointers; + size_t m_chunk_size; + Allocator m_mempool; - PointerView m_pointers; - size_t m_num_ptrs; - size_t m_chunk_size; - MemorySpace m_space; - - deallocate_memory( PointerView & ptrs, size_t nptrs, - size_t cs, MemorySpace & sp ) - : m_pointers( ptrs ), m_num_ptrs( nptrs ), m_chunk_size( cs ), m_space( sp ) + deallocate_memory( PointerView & ptrs, size_t num_ptrs, + size_t cs, Allocator & m ) + : m_pointers( ptrs ), m_chunk_size( cs ), m_mempool( m ) { // Initialize the view with the out degree of each vertex. 
- Kokkos::parallel_for( m_num_ptrs * STRIDE , *this ); + Kokkos::parallel_for( num_ptrs * STRIDE, *this ); } KOKKOS_INLINE_FUNCTION void operator()( size_type i ) const { if ( i % STRIDE == 0 ) { - m_space.deallocate( m_pointers[i / STRIDE].ptr, m_chunk_size ); + m_mempool.deallocate( m_pointers[i / STRIDE].ptr, m_chunk_size ); } } }; -template < typename ExecutionSpace, typename MemorySpace > +template < typename WorkView, typename PointerView, typename ScalarView, + typename Allocator > struct allocate_deallocate_memory { - typedef ExecutionSpace execution_space; + typedef typename WorkView::execution_space execution_space; typedef typename execution_space::size_type size_type; - enum { STRIDE = 32 }; - - size_t m_num_max_chunks; - size_t m_max_chunk_size; - size_t m_min_chunk_size; - size_t m_chunk_spacing; - MemorySpace m_space; + WorkView m_work; + PointerView m_pointers; + ScalarView m_ptrs_front; + ScalarView m_ptrs_back; + Allocator m_mempool; - allocate_deallocate_memory( size_t nmc, size_t max_cs, - size_t min_cs, size_t cs, MemorySpace & sp ) - : m_num_max_chunks( nmc ), m_max_chunk_size( max_cs ), - m_min_chunk_size( min_cs ), m_chunk_spacing( cs ), m_space( sp ) + allocate_deallocate_memory( WorkView & w, size_t work_size, PointerView & p, + ScalarView pf, ScalarView pb, Allocator & m ) + : m_work( w ), m_pointers( p ), m_ptrs_front( pf ), m_ptrs_back( pb ), + m_mempool( m ) { - Kokkos::parallel_for( m_num_max_chunks * STRIDE, *this ); + // Initialize the view with the out degree of each vertex. + Kokkos::parallel_for( work_size * STRIDE, *this ); } KOKKOS_INLINE_FUNCTION void operator()( size_type i ) const { if ( i % STRIDE == 0 ) { - for ( size_t j = m_max_chunk_size; j >= m_min_chunk_size; j /= m_chunk_spacing ) { - for ( size_t k = 0; k < 10; ++k ) { - void * mem = m_space.allocate( j ); - m_space.deallocate( mem, j ); - } + unsigned my_work = m_work[i / STRIDE]; + + if ( ( my_work & 1 ) == 0 ) { + // Allocation. 
+ size_t pos = Kokkos::atomic_fetch_add( &m_ptrs_back(), 1 ); + size_t alloc_size = my_work >> 1; + m_pointers[pos].ptr = m_mempool.allocate( alloc_size ); + m_pointers[pos].size = alloc_size; + } + else { + // Deallocation. + size_t pos = Kokkos::atomic_fetch_add( &m_ptrs_front(), 1 ); + m_mempool.deallocate( m_pointers[pos].ptr, m_pointers[pos].size ); } } } @@ -255,12 +294,14 @@ void print_results( const std::string & text, unsigned long long width, // pool and breaking large chunks into smaller chunks to fulfill allocation // requests. It verifies that MemoryPool(), allocate(), and deallocate() work // correctly. -template < class ExecSpace, class MemorySpace = typename ExecSpace::memory_space > +template < class Device > bool test_mempool( size_t chunk_size, size_t total_size ) { - typedef Kokkos::View< pointer_obj *, ExecSpace > pointer_view; - typedef Kokkos::Experimental::MemoryPool< MemorySpace , ExecSpace > - pool_memory_space; + typedef typename Device::execution_space execution_space; + typedef typename Device::memory_space memory_space; + typedef Device device_type; + typedef Kokkos::View< pointer_obj *, device_type > pointer_view; + typedef Kokkos::Experimental::MemoryPool< device_type > pool_memory_space; uint64_t result; size_t num_chunks = total_size / chunk_size; @@ -269,7 +310,8 @@ bool test_mempool( size_t chunk_size, size_t total_size ) pointer_view pointers( "pointers", num_chunks ); #ifdef TESTMEMORYPOOL_PRINT - std::cout << std::setw( SHIFTW ) << "chunk_size: " << std::setw( 12 ) + std::cout << "*** test_mempool() ***" << std::endl + << std::setw( SHIFTW ) << "chunk_size: " << std::setw( 12 ) << chunk_size << std::endl << std::setw( SHIFTW ) << "total_size: " << std::setw( 12 ) << total_size << std::endl @@ -277,46 +319,53 @@ bool test_mempool( size_t chunk_size, size_t total_size ) << num_chunks << std::endl; double elapsed_time = 0; - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; #endif - pool_memory_space m_space( MemorySpace(), 
chunk_size, total_size ); + pool_memory_space mempool( memory_space(), total_size * 1.2, 20 ); #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); + execution_space::fence(); elapsed_time = timer.seconds(); print_results( "initialize mempool: ", elapsed_time ); #ifdef TESTMEMORYPOOL_PRINT_STATUS - m_space.print_status(); + mempool.print_status(); #endif timer.reset(); #endif - // Tests: - // test for correct behvior when out of memory - // test for correct behvior when interleaving allocate() and deallocate() - { allocate_memory< pointer_view, pool_memory_space > - am( pointers, num_chunks, chunk_size, m_space ); + am( pointers, num_chunks, chunk_size, mempool ); } #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); + execution_space::fence(); elapsed_time = timer.seconds(); print_results( "allocate chunks: ", elapsed_time ); #ifdef TESTMEMORYPOOL_PRINT_STATUS - m_space.print_status(); + mempool.print_status(); #endif timer.reset(); #endif + { + count_invalid_memory< pointer_view > sm( pointers, num_chunks, result ); + } + +#ifdef TESTMEMORYPOOL_PRINT + execution_space::fence(); + elapsed_time = timer.seconds(); + print_results( "invalid chunks: ", 16, elapsed_time, result ); + timer.reset(); +#endif + { fill_memory< pointer_view > fm( pointers, num_chunks ); } #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); + execution_space::fence(); elapsed_time = timer.seconds(); print_results( "fill chunks: ", elapsed_time ); timer.reset(); @@ -326,10 +375,11 @@ bool test_mempool( size_t chunk_size, size_t total_size ) sum_memory< pointer_view > sm( pointers, num_chunks, result ); } + execution_space::fence(); + #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); elapsed_time = timer.seconds(); - print_results( "sum chunks: ", 10, elapsed_time, result ); + print_results( "sum chunks: ", 16, elapsed_time, result ); #endif if ( result != ( num_chunks * ( num_chunks - 1 ) ) / 2 ) { @@ -343,40 +393,51 @@ bool test_mempool( size_t chunk_size, size_t total_size ) { deallocate_memory< 
pointer_view, pool_memory_space > - dm( pointers, num_chunks, chunk_size, m_space ); + dm( pointers, num_chunks, chunk_size, mempool ); } #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); + execution_space::fence(); elapsed_time = timer.seconds(); print_results( "deallocate chunks: ", elapsed_time ); #ifdef TESTMEMORYPOOL_PRINT_STATUS - m_space.print_status(); + mempool.print_status(); #endif timer.reset(); #endif { allocate_memory< pointer_view, pool_memory_space > - am( pointers, num_chunks, chunk_size, m_space ); + am( pointers, num_chunks, chunk_size, mempool ); } #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); + execution_space::fence(); elapsed_time = timer.seconds(); print_results( "allocate chunks: ", elapsed_time ); #ifdef TESTMEMORYPOOL_PRINT_STATUS - m_space.print_status(); + mempool.print_status(); #endif timer.reset(); #endif + { + count_invalid_memory< pointer_view > sm( pointers, num_chunks, result ); + } + +#ifdef TESTMEMORYPOOL_PRINT + execution_space::fence(); + elapsed_time = timer.seconds(); + print_results( "invalid chunks: ", 16, elapsed_time, result ); + timer.reset(); +#endif + { fill_memory< pointer_view > fm( pointers, num_chunks ); } #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); + execution_space::fence(); elapsed_time = timer.seconds(); print_results( "fill chunks: ", elapsed_time ); timer.reset(); @@ -386,10 +447,11 @@ bool test_mempool( size_t chunk_size, size_t total_size ) sum_memory< pointer_view > sm( pointers, num_chunks, result ); } + execution_space::fence(); + #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); elapsed_time = timer.seconds(); - print_results( "sum chunks: ", 10, elapsed_time, result ); + print_results( "sum chunks: ", 16, elapsed_time, result ); #endif if ( result != ( num_chunks * ( num_chunks - 1 ) ) / 2 ) { @@ -403,78 +465,340 @@ bool test_mempool( size_t chunk_size, size_t total_size ) { deallocate_memory< pointer_view, pool_memory_space > - dm( pointers, num_chunks, chunk_size, m_space ); + dm( 
pointers, num_chunks, chunk_size, mempool ); } #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); + execution_space::fence(); elapsed_time = timer.seconds(); print_results( "deallocate chunks: ", elapsed_time ); #ifdef TESTMEMORYPOOL_PRINT_STATUS - m_space.print_status(); + mempool.print_status(); #endif #endif return return_val; } +template < typename T > +T smallest_power2_ge( T val ) +{ + // Find the most significant nonzero bit. + int first_nonzero_bit = Kokkos::Impl::bit_scan_reverse( val ); + + // If val is an integral power of 2, ceil( log2(val) ) is equal to the + // most significant nonzero bit. Otherwise, you need to add 1. + int lg2_size = first_nonzero_bit + + !Kokkos::Impl::is_integral_power_of_two( val ); + + return T(1) << T(lg2_size); +} + // This test makes allocation requests for multiple sizes and interleaves // allocation and deallocation. -template < class ExecSpace, class MemorySpace = typename ExecSpace::memory_space > -void test_mempool2( size_t chunk_size, size_t total_size ) +// +// There are 3 phases. The first phase does only allocations to build up a +// working state for the allocator. The second phase interleaves allocations +// and deletions. The third phase does only deallocations to undo all the +// allocations from the first phase. By building first to a working state, +// allocations and deallocations can happen in any order for the second phase. +// Each phase performs on multiple chunk sizes. 
+template < class Device > +void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes, + size_t phase1_size, size_t phase2_size ) { - typedef Kokkos::Experimental::MemoryPool< MemorySpace , ExecSpace > - pool_memory_space; +#ifdef TESTMEMORYPOOL_PRINT + typedef typename Device::execution_space execution_space; +#endif + typedef typename Device::memory_space memory_space; + typedef Device device_type; + typedef Kokkos::View< unsigned *, device_type > work_view; + typedef Kokkos::View< size_t, device_type > scalar_view; + typedef Kokkos::View< pointer_obj2 *, device_type > pointer_view; + typedef Kokkos::Experimental::MemoryPool< device_type > pool_memory_space; + + enum { + MIN_CHUNK_SIZE = 64, + MIN_BASE_CHUNK_SIZE = MIN_CHUNK_SIZE / 2 + 1 + }; + + // Make sure the base chunk size is at least MIN_BASE_CHUNK_SIZE bytes, so + // all the different chunk sizes translate to different block sizes for the + // allocator. + if ( base_chunk_size < MIN_BASE_CHUNK_SIZE ) { + base_chunk_size = MIN_BASE_CHUNK_SIZE; + } + + // Get the smallest power of 2 >= the base chunk size. The size must be + // >= MIN_CHUNK_SIZE, though. + unsigned ceil_base_chunk_size = smallest_power2_ge( base_chunk_size ); + if ( ceil_base_chunk_size < MIN_CHUNK_SIZE ) { + ceil_base_chunk_size = MIN_CHUNK_SIZE; + } + + // Make sure the phase 1 size is multiples of num_chunk_sizes. + phase1_size = ( ( phase1_size + num_chunk_sizes - 1 ) / num_chunk_sizes ) * + num_chunk_sizes; + + // Make sure the phase 2 size is multiples of (2 * num_chunk_sizes). + phase2_size = + ( ( phase2_size + 2 * num_chunk_sizes - 1 ) / ( 2 * num_chunk_sizes ) ) * + 2 * num_chunk_sizes; + + // The phase2 size must be <= twice the phase1 size so that deallocations + // can't happen before allocations. + if ( phase2_size > 2 * phase1_size ) phase2_size = 2 * phase1_size; + + size_t phase3_size = phase1_size; + size_t half_phase2_size = phase2_size / 2; + + // Each entry in the work views has the following format. 
The least + // significant bit indicates allocation (0) vs. deallocation (1). For + // allocation, the other bits indicate the desired allocation size. + + // Initialize the phase 1 work view with an equal number of allocations for + // each chunk size. + work_view phase1_work( "Phase 1 Work", phase1_size ); + typename work_view::HostMirror host_phase1_work = + create_mirror_view(phase1_work); + + size_t inner_size = phase1_size / num_chunk_sizes; + unsigned chunk_size = base_chunk_size; + + for ( size_t i = 0; i < num_chunk_sizes; ++i ) { + for ( size_t j = 0; j < inner_size; ++j ) { + host_phase1_work[i * inner_size + j] = chunk_size << 1; + } + + chunk_size *= 2; + } + + std::random_shuffle( host_phase1_work.ptr_on_device(), + host_phase1_work.ptr_on_device() + phase1_size ); + + deep_copy( phase1_work, host_phase1_work ); + + // Initialize the phase 2 work view with half allocations and half + // deallocations with an equal number of allocations for each chunk size. + work_view phase2_work( "Phase 2 Work", phase2_size ); + typename work_view::HostMirror host_phase2_work = + create_mirror_view(phase2_work); - size_t num_chunk_sizes = 4; - size_t chunk_spacing = 4; + inner_size = half_phase2_size / num_chunk_sizes; + chunk_size = base_chunk_size; + + for ( size_t i = 0; i < num_chunk_sizes; ++i ) { + for ( size_t j = 0; j < inner_size; ++j ) { + host_phase2_work[i * inner_size + j] = chunk_size << 1; + } + + chunk_size *= 2; + } + + for ( size_t i = half_phase2_size; i < phase2_size; ++i ) { + host_phase2_work[i] = 1; + } + + std::random_shuffle( host_phase2_work.ptr_on_device(), + host_phase2_work.ptr_on_device() + phase2_size ); + + deep_copy( phase2_work, host_phase2_work ); + + // Initialize the phase 3 work view with all deallocations. 
+ work_view phase3_work( "Phase 3 Work", phase3_size ); + typename work_view::HostMirror host_phase3_work = + create_mirror_view(phase3_work); + + inner_size = phase3_size / num_chunk_sizes; + + for ( size_t i = 0; i < phase3_size; ++i ) host_phase3_work[i] = 1; + + deep_copy( phase3_work, host_phase3_work ); + + // Calculate the amount of memory needed for the allocator. We need to know + // the number of superblocks required for each chunk size and use that to + // calculate the amount of memory for each chunk size. + size_t lg_sb_size = 18; + size_t sb_size = 1 << lg_sb_size; + size_t total_size = 0; + size_t allocs_per_size = phase1_size / num_chunk_sizes + + half_phase2_size / num_chunk_sizes; + + chunk_size = ceil_base_chunk_size; + for ( size_t i = 0; i < num_chunk_sizes; ++i ) { + size_t my_size = allocs_per_size * chunk_size; + total_size += ( my_size + sb_size - 1 ) / sb_size * sb_size; + chunk_size *= 2; + } + + // Declare the queue to hold the records for allocated memory. An allocation + // adds a record to the back of the queue, and a deallocation removes a + // record from the front of the queue. 
+ size_t num_allocations = phase1_size + half_phase2_size; + scalar_view ptrs_front( "Pointers front" ); + scalar_view ptrs_back( "Pointers back" ); + + pointer_view pointers( "pointers", num_allocations ); #ifdef TESTMEMORYPOOL_PRINT + printf( "\n*** test_mempool2() ***\n" ); + printf( " num_chunk_sizes: %12zu\n", num_chunk_sizes ); + printf( " base_chunk_size: %12u\n", base_chunk_size ); + printf( " ceil_base_chunk_size: %12u\n", ceil_base_chunk_size ); + printf( " phase1_size: %12zu\n", phase1_size ); + printf( " phase2_size: %12zu\n", phase2_size ); + printf( " phase3_size: %12zu\n", phase3_size ); + printf( " allocs_per_size: %12zu\n", allocs_per_size ); + printf( " num_allocations: %12zu\n", num_allocations ); + printf( " total_size: %12zu\n", total_size ); + fflush( stdout ); + double elapsed_time = 0; - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; #endif - pool_memory_space m_space( MemorySpace(), chunk_size, total_size, - num_chunk_sizes, chunk_spacing ); + pool_memory_space mempool( memory_space(), total_size * 1.2, lg_sb_size ); #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); + execution_space::fence(); elapsed_time = timer.seconds(); print_results( "initialize mempool: ", elapsed_time ); + +#ifdef TESTMEMORYPOOL_PRINT_STATUS + mempool.print_status(); +#endif + + timer.reset(); +#endif + + { + allocate_deallocate_memory< work_view, pointer_view, scalar_view, + pool_memory_space > + adm( phase1_work, phase1_size, pointers, ptrs_front, ptrs_back, mempool ); + } + +#ifdef TESTMEMORYPOOL_PRINT + execution_space::fence(); + elapsed_time = timer.seconds(); + print_results( "phase1: ", elapsed_time ); + +#ifdef TESTMEMORYPOOL_PRINT_STATUS + mempool.print_status(); +#endif + + timer.reset(); +#endif + + { + allocate_deallocate_memory< work_view, pointer_view, scalar_view, + pool_memory_space > + adm( phase2_work, phase2_size, pointers, ptrs_front, ptrs_back, mempool ); + } + +#ifdef TESTMEMORYPOOL_PRINT + execution_space::fence(); + elapsed_time = 
timer.seconds(); + print_results( "phase2: ", elapsed_time ); + +#ifdef TESTMEMORYPOOL_PRINT_STATUS + mempool.print_status(); #endif - chunk_size = m_space.get_min_chunk_size(); - total_size = m_space.get_mem_size(); + timer.reset(); +#endif - // Get the chunk size for the largest possible chunk. - // max_chunk_size = - // chunk_size * (MEMPOOL_CHUNK_SPACING ^ (MEMPOOL_NUM_CHUNK_SIZES - 1)) - size_t max_chunk_size = chunk_size; - for (size_t i = 1; i < num_chunk_sizes; ++i) { - max_chunk_size *= chunk_spacing; + { + allocate_deallocate_memory< work_view, pointer_view, scalar_view, + pool_memory_space > + adm( phase3_work, phase3_size, pointers, ptrs_front, ptrs_back, mempool ); } - size_t num_max_chunks = total_size / ( max_chunk_size * num_chunk_sizes ); +#ifdef TESTMEMORYPOOL_PRINT + execution_space::fence(); + elapsed_time = timer.seconds(); + print_results( "phase3: ", elapsed_time ); +#ifdef TESTMEMORYPOOL_PRINT_STATUS + mempool.print_status(); +#endif +#endif +} + +// Tests for correct behavior when the allocator is out of memory. +template < class Device > +void test_memory_exhaustion() +{ +#ifdef TESTMEMORYPOOL_PRINT + typedef typename Device::execution_space execution_space; +#endif + typedef typename Device::memory_space memory_space; + typedef Device device_type; + typedef Kokkos::View< pointer_obj *, device_type > pointer_view; + typedef Kokkos::Experimental::MemoryPool< device_type > pool_memory_space; + + // The allocator will have a single superblock, and allocations will all be + // of the same chunk size. The allocation loop will attempt to allocate + // twice the number of chunks as are available in the allocator. The + // deallocation loop will only free the successfully allocated chunks. 
+ + size_t chunk_size = 128; + size_t num_chunks = 128; + size_t half_num_chunks = num_chunks / 2; + size_t superblock_size = chunk_size * half_num_chunks; + size_t lg_superblock_size = + Kokkos::Impl::integral_power_of_two( superblock_size ); + pointer_view pointers( "pointers", num_chunks ); + +#ifdef TESTMEMORYPOOL_PRINT + std::cout << "\n*** test_memory_exhaustion() ***" << std::endl; + + double elapsed_time = 0; + Kokkos::Timer timer; +#endif + + pool_memory_space mempool( memory_space(), superblock_size, + lg_superblock_size ); + +#ifdef TESTMEMORYPOOL_PRINT + execution_space::fence(); + elapsed_time = timer.seconds(); + print_results( "initialize mempool: ", elapsed_time ); #ifdef TESTMEMORYPOOL_PRINT_STATUS - m_space.print_status(); + mempool.print_status(); +#endif + timer.reset(); #endif + { + allocate_memory< pointer_view, pool_memory_space > + am( pointers, num_chunks, chunk_size, mempool ); + } + #ifdef TESTMEMORYPOOL_PRINT + execution_space::fence(); + elapsed_time = timer.seconds(); + print_results( "allocate chunks: ", elapsed_time ); +#ifdef TESTMEMORYPOOL_PRINT_STATUS + mempool.print_status(); +#endif timer.reset(); #endif { - allocate_deallocate_memory< ExecSpace, pool_memory_space > - am( num_max_chunks, max_chunk_size, chunk_size, chunk_spacing, m_space ); + // In parallel, the allocations that succeeded were not put contiguously + // into the pointers View. The whole View can still be looped over and + // have deallocate called because deallocate will just do nothing for NULL + // pointers. 
+ deallocate_memory< pointer_view, pool_memory_space > + dm( pointers, num_chunks, chunk_size, mempool ); } #ifdef TESTMEMORYPOOL_PRINT - ExecSpace::fence(); + execution_space::fence(); elapsed_time = timer.seconds(); - print_results( "allocate / deallocate: ", elapsed_time ); + print_results( "deallocate chunks: ", elapsed_time ); #ifdef TESTMEMORYPOOL_PRINT_STATUS - m_space.print_status(); + mempool.print_status(); #endif #endif } @@ -489,4 +813,8 @@ void test_mempool2( size_t chunk_size, size_t total_size ) #undef TESTMEMORYPOOL_PRINT_STATUS #endif +#ifdef STRIDE +#undef STRIDE +#endif + #endif diff --git a/lib/kokkos/core/unit_test/TestOpenMP.cpp b/lib/kokkos/core/unit_test/TestOpenMP.cpp index 35bc7c9869..6e8fc45179 100644 --- a/lib/kokkos/core/unit_test/TestOpenMP.cpp +++ b/lib/kokkos/core/unit_test/TestOpenMP.cpp @@ -55,6 +55,7 @@ #include <TestViewImpl.hpp> #include <TestAtomic.hpp> +#include <TestAtomicOperations.hpp> #include <TestViewAPI.hpp> #include <TestViewSubview.hpp> @@ -81,6 +82,7 @@ #include <TestPolicyConstruction.hpp> +#include <TestMDRange.hpp> namespace Test { @@ -97,6 +99,7 @@ protected: Kokkos::OpenMP::initialize( threads_count ); Kokkos::OpenMP::print_configuration( std::cout , true ); + srand(10231); } static void TearDownTestCase() @@ -110,6 +113,12 @@ protected: }; +TEST_F( openmp , md_range ) { + TestMDRange_2D< Kokkos::OpenMP >::test_for2(100,100); + + TestMDRange_3D< Kokkos::OpenMP >::test_for3(100,100,100); +} + TEST_F( openmp , impl_shared_alloc ) { test_shared_alloc< Kokkos::HostSpace , Kokkos::OpenMP >(); } @@ -180,5 +189,74 @@ TEST_F( openmp , atomics ) ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,3) ) ); } +TEST_F( openmp , atomic_operations ) +{ + const int start = 1; //Avoid zero for division + const int end = 11; + for (int i = start; i < end; ++i) + { + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( 
TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned 
int,Kokkos::OpenMP>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long 
int,Kokkos::OpenMP>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 4 ) ) ); + + ASSERT_TRUE( ( 
TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 4 ) ) ); + } + +} + } // namespace test diff --git a/lib/kokkos/core/unit_test/TestOpenMP_a.cpp b/lib/kokkos/core/unit_test/TestOpenMP_a.cpp index 919eea7c80..64eac66804 100644 --- a/lib/kokkos/core/unit_test/TestOpenMP_a.cpp +++ b/lib/kokkos/core/unit_test/TestOpenMP_a.cpp @@ -86,27 +86,8 @@ namespace Test { class openmp : public ::testing::Test { protected: - static void SetUpTestCase() - { - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); - - const unsigned threads_count = std::max( 1u , numa_count ) * - std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 ); - - Kokkos::OpenMP::initialize( threads_count ); - Kokkos::OpenMP::print_configuration( std::cout , true ); - } - - static void TearDownTestCase() - { - Kokkos::OpenMP::finalize(); - - omp_set_num_threads(1); - - ASSERT_EQ( 1 , omp_get_max_threads() ); - } + static void SetUpTestCase(); + static void TearDownTestCase(); }; TEST_F( openmp, view_subview_auto_1d_left ) { diff --git a/lib/kokkos/core/unit_test/TestOpenMP_b.cpp b/lib/kokkos/core/unit_test/TestOpenMP_b.cpp index f024e22422..6cc2476014 100644 --- a/lib/kokkos/core/unit_test/TestOpenMP_b.cpp +++ b/lib/kokkos/core/unit_test/TestOpenMP_b.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 
2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -86,27 +86,8 @@ namespace Test { class openmp : public ::testing::Test { protected: - static void SetUpTestCase() - { - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); - - const unsigned threads_count = std::max( 1u , numa_count ) * - std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 ); - - Kokkos::OpenMP::initialize( threads_count ); - Kokkos::OpenMP::print_configuration( std::cout , true ); - } - - static void TearDownTestCase() - { - Kokkos::OpenMP::finalize(); - - omp_set_num_threads(1); - - ASSERT_EQ( 1 , omp_get_max_threads() ); - } + static void SetUpTestCase(); + static void TearDownTestCase(); }; TEST_F( openmp , range_tag ) @@ -122,6 +103,10 @@ TEST_F( openmp , range_tag ) TEST_F( openmp , team_tag ) { + TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(2); + TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2); + TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2); + TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2); TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); TestTeamPolicy< Kokkos::OpenMP , 
Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000); @@ -148,6 +133,14 @@ TEST_F( openmp, long_reduce_dynamic_view ) { TestReduceDynamicView< long , Kokkos::OpenMP >( 1000000 ); } +TEST_F( openmp , reducers ) +{ + TestReducers<int, Kokkos::OpenMP>::execute_integer(); + TestReducers<size_t, Kokkos::OpenMP>::execute_integer(); + TestReducers<double, Kokkos::OpenMP>::execute_float(); + TestReducers<Kokkos::complex<double>, Kokkos::OpenMP>::execute_basic(); +} + TEST_F( openmp, team_long_reduce) { TestReduceTeam< long , Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 3 ); TestReduceTeam< long , Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 3 ); @@ -172,12 +165,21 @@ TEST_F( openmp, team_scratch_request) { TestScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >(); } -#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) +#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) TEST_F( openmp, team_lambda_shared_request) { TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >(); TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >(); } #endif +TEST_F( openmp, shmem_size) { + TestShmemSize< Kokkos::OpenMP >(); +} + +TEST_F( openmp, multi_level_scratch) { + TestMultiLevelScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >(); + TestMultiLevelScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >(); +} + } // namespace test diff --git a/lib/kokkos/core/unit_test/TestOpenMP_c.cpp b/lib/kokkos/core/unit_test/TestOpenMP_c.cpp index d9ed87878a..f0cdabe913 100644 --- a/lib/kokkos/core/unit_test/TestOpenMP_c.cpp +++ b/lib/kokkos/core/unit_test/TestOpenMP_c.cpp @@ -71,6 +71,7 @@ #include <TestAggregateReduction.hpp> #include <TestCompilerMacros.hpp> #include <TestMemoryPool.hpp> +#include <TestTaskPolicy.hpp> #include <TestCXX11.hpp> @@ -86,27 +87,8 @@ namespace Test { class openmp : 
public ::testing::Test { protected: - static void SetUpTestCase() - { - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); - - const unsigned threads_count = std::max( 1u , numa_count ) * - std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 ); - - Kokkos::OpenMP::initialize( threads_count ); - Kokkos::OpenMP::print_configuration( std::cout , true ); - } - - static void TearDownTestCase() - { - Kokkos::OpenMP::finalize(); - - omp_set_num_threads(1); - - ASSERT_EQ( 1 , omp_get_max_threads() ); - } + static void SetUpTestCase(); + static void TearDownTestCase(); }; TEST_F( openmp , view_remap ) @@ -197,7 +179,9 @@ TEST_F( openmp , memory_pool ) bool val = TestMemoryPool::test_mempool< Kokkos::OpenMP >( 128, 128000000 ); ASSERT_TRUE( val ); - TestMemoryPool::test_mempool2< Kokkos::OpenMP >( 128, 128000000 ); + TestMemoryPool::test_mempool2< Kokkos::OpenMP >( 64, 4, 1000000, 2000000 ); + + TestMemoryPool::test_memory_exhaustion< Kokkos::OpenMP >(); } //---------------------------------------------------------------------------- @@ -240,5 +224,39 @@ TEST_F( openmp , team_vector ) ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(9) ) ); ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(10) ) ); } + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + +TEST_F( openmp , task_fib ) +{ + for ( int i = 0 ; i < 25 ; ++i ) { + TestTaskPolicy::TestFib< Kokkos::OpenMP >::run(i, (i+1)*1000000 ); + } +} + +TEST_F( openmp , task_depend ) +{ + for ( int i = 0 ; i < 25 ; ++i ) { + TestTaskPolicy::TestTaskDependence< Kokkos::OpenMP >::run(i); + } +} + +TEST_F( openmp , task_team ) +{ + TestTaskPolicy::TestTaskTeam< Kokkos::OpenMP >::run(1000); + //TestTaskPolicy::TestTaskTeamValue< Kokkos::OpenMP >::run(1000); //TODO 
put back after testing +} + + +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ + + } // namespace test + + + + + diff --git a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp index 5aac8332fc..049138eb07 100644 --- a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp +++ b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp @@ -421,68 +421,68 @@ private: ASSERT_EQ (p1.league_size() , league_size); ASSERT_EQ (p1.team_size() , team_size); ASSERT_TRUE(p1.chunk_size() > 0); - ASSERT_EQ (p1.scratch_size(), 0); + ASSERT_EQ (p1.scratch_size(0), 0); policy_t p2 = p1.set_chunk_size(chunk_size); ASSERT_EQ (p1.league_size() , league_size); ASSERT_EQ (p1.team_size() , team_size); ASSERT_TRUE(p1.chunk_size() > 0); - ASSERT_EQ (p1.scratch_size(), 0); + ASSERT_EQ (p1.scratch_size(0), 0); ASSERT_EQ (p2.league_size() , league_size); ASSERT_EQ (p2.team_size() , team_size); ASSERT_EQ (p2.chunk_size() , chunk_size); - ASSERT_EQ (p2.scratch_size(), 0); + ASSERT_EQ (p2.scratch_size(0), 0); policy_t p3 = p2.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); ASSERT_EQ (p2.league_size() , league_size); ASSERT_EQ (p2.team_size() , team_size); ASSERT_EQ (p2.chunk_size() , chunk_size); - ASSERT_EQ (p2.scratch_size(), 0); + ASSERT_EQ (p2.scratch_size(0), 0); ASSERT_EQ (p3.league_size() , league_size); ASSERT_EQ (p3.team_size() , team_size); ASSERT_EQ (p3.chunk_size() , chunk_size); - ASSERT_EQ (p3.scratch_size(), per_team_scratch); + ASSERT_EQ (p3.scratch_size(0), per_team_scratch); policy_t p4 = p2.set_scratch_size(0,Kokkos::PerThread(per_thread_scratch)); ASSERT_EQ (p2.league_size() , league_size); ASSERT_EQ (p2.team_size() , team_size); ASSERT_EQ (p2.chunk_size() , chunk_size); - ASSERT_EQ (p2.scratch_size(), 0); + ASSERT_EQ (p2.scratch_size(0), 0); ASSERT_EQ (p4.league_size() , league_size); ASSERT_EQ (p4.team_size() , team_size); ASSERT_EQ (p4.chunk_size() , chunk_size); - ASSERT_EQ (p4.scratch_size(), 
per_thread_scratch*team_size); + ASSERT_EQ (p4.scratch_size(0), per_thread_scratch*team_size); policy_t p5 = p2.set_scratch_size(0,Kokkos::PerThread(per_thread_scratch),Kokkos::PerTeam(per_team_scratch)); ASSERT_EQ (p2.league_size() , league_size); ASSERT_EQ (p2.team_size() , team_size); ASSERT_EQ (p2.chunk_size() , chunk_size); - ASSERT_EQ (p2.scratch_size(), 0); + ASSERT_EQ (p2.scratch_size(0), 0); ASSERT_EQ (p5.league_size() , league_size); ASSERT_EQ (p5.team_size() , team_size); ASSERT_EQ (p5.chunk_size() , chunk_size); - ASSERT_EQ (p5.scratch_size(), scratch_size); + ASSERT_EQ (p5.scratch_size(0), scratch_size); policy_t p6 = p2.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch),Kokkos::PerThread(per_thread_scratch)); ASSERT_EQ (p2.league_size() , league_size); ASSERT_EQ (p2.team_size() , team_size); ASSERT_EQ (p2.chunk_size() , chunk_size); - ASSERT_EQ (p2.scratch_size(), 0); + ASSERT_EQ (p2.scratch_size(0), 0); ASSERT_EQ (p6.league_size() , league_size); ASSERT_EQ (p6.team_size() , team_size); ASSERT_EQ (p6.chunk_size() , chunk_size); - ASSERT_EQ (p6.scratch_size(), scratch_size); + ASSERT_EQ (p6.scratch_size(0), scratch_size); policy_t p7 = p3.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch),Kokkos::PerThread(per_thread_scratch)); ASSERT_EQ (p3.league_size() , league_size); ASSERT_EQ (p3.team_size() , team_size); ASSERT_EQ (p3.chunk_size() , chunk_size); - ASSERT_EQ (p3.scratch_size(), per_team_scratch); + ASSERT_EQ (p3.scratch_size(0), per_team_scratch); ASSERT_EQ (p7.league_size() , league_size); ASSERT_EQ (p7.team_size() , team_size); ASSERT_EQ (p7.chunk_size() , chunk_size); - ASSERT_EQ (p7.scratch_size(), scratch_size); + ASSERT_EQ (p7.scratch_size(0), scratch_size); } void test_run_time_parameters() { test_run_time_parameters_type<Kokkos::TeamPolicy<ExecutionSpace> >(); diff --git a/lib/kokkos/core/unit_test/TestQthread.cpp b/lib/kokkos/core/unit_test/TestQthread.cpp index a8f2acea1d..431b844c9f 100644 --- 
a/lib/kokkos/core/unit_test/TestQthread.cpp +++ b/lib/kokkos/core/unit_test/TestQthread.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -249,6 +249,10 @@ TEST_F( qthread, team_shared ) { TestSharedTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >(); } +TEST_F( qthread, shmem_size) { + TestShmemSize< Kokkos::Qthread >(); +} + TEST_F( qthread , team_scan ) { TestScanTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 10 ); diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp index c7fb7e9004..be8b4f90a3 100644 --- a/lib/kokkos/core/unit_test/TestRange.hpp +++ b/lib/kokkos/core/unit_test/TestRange.hpp @@ -185,7 +185,7 @@ struct TestRange { },error); ASSERT_EQ(error,0); - if(ExecSpace::concurrency()>1) { + if( ( ExecSpace::concurrency()>(int)1) && (N>static_cast<const size_t>(4*ExecSpace::concurrency())) ) { size_t min = N; size_t max = 0; for(int t=0; t<ExecSpace::concurrency(); t++) { @@ -196,6 +196,7 @@ struct TestRange { //if(ExecSpace::concurrency()>2) // ASSERT_TRUE(2*min<max); } + } { @@ -218,7 +219,7 @@ struct TestRange { },error); ASSERT_EQ(error,0); - if(ExecSpace::concurrency()>1) { + if( ( ExecSpace::concurrency()>(int)1) && (N>static_cast<const size_t>(4*ExecSpace::concurrency())) ) { size_t min = N; size_t max = 0; for(int t=0; 
t<ExecSpace::concurrency(); t++) { diff --git a/lib/kokkos/core/unit_test/TestReduce.hpp b/lib/kokkos/core/unit_test/TestReduce.hpp index f5ce0e4dd2..53fc393bcc 100644 --- a/lib/kokkos/core/unit_test/TestReduce.hpp +++ b/lib/kokkos/core/unit_test/TestReduce.hpp @@ -457,7 +457,1415 @@ public: } } }; +} + +// Computes y^T*A*x +// (modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar ) + +#if ( ! defined( KOKKOS_HAVE_CUDA ) ) || defined( KOKKOS_CUDA_USE_LAMBDA ) + +template< typename ScalarType , class DeviceType > +class TestTripleNestedReduce +{ +public: + typedef DeviceType execution_space ; + typedef typename execution_space::size_type size_type ; + + //------------------------------------ + + TestTripleNestedReduce( const size_type & nrows , const size_type & ncols + , const size_type & team_size , const size_type & vector_length ) + { + run_test( nrows , ncols , team_size, vector_length ); + } + + void run_test( const size_type & nrows , const size_type & ncols + , const size_type & team_size, const size_type & vector_length ) + { + //typedef Kokkos::LayoutLeft Layout; + typedef Kokkos::LayoutRight Layout; + + typedef Kokkos::View<ScalarType* , DeviceType> ViewVector; + typedef Kokkos::View<ScalarType** , Layout , DeviceType> ViewMatrix; + ViewVector y( "y" , nrows ); + ViewVector x( "x" , ncols ); + ViewMatrix A( "A" , nrows , ncols ); + + typedef Kokkos::RangePolicy<DeviceType> range_policy; + + // Initialize y vector + Kokkos::parallel_for( range_policy( 0 , nrows ) , KOKKOS_LAMBDA( const int i ) { y( i ) = 1; } ); + + // Initialize x vector + Kokkos::parallel_for( range_policy( 0 , ncols ) , KOKKOS_LAMBDA( const int i ) { x( i ) = 1; } ); + + typedef Kokkos::TeamPolicy<DeviceType> team_policy; + typedef typename Kokkos::TeamPolicy<DeviceType>::member_type member_type; + + // Initialize A matrix, note 2D indexing computation + Kokkos::parallel_for( team_policy( nrows , Kokkos::AUTO ) , KOKKOS_LAMBDA( const member_type& teamMember ) { + const int j = 
teamMember.league_rank(); + Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember , ncols ) , [&] ( const int i ) { + A( j , i ) = 1; + } ); + } ); + + // Three level parallelism kernel to force caching of vector x + ScalarType result = 0.0; + int chunk_size = 128; + Kokkos::parallel_reduce( team_policy( nrows/chunk_size , team_size , vector_length ) , KOKKOS_LAMBDA ( const member_type& teamMember , double &update ) { + const int row_start = teamMember.league_rank() * chunk_size; + const int row_end = row_start + chunk_size; + Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember , row_start , row_end ) , [&] ( const int i ) { + ScalarType sum_i = 0.0; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( teamMember , ncols ) , [&] ( const int j , ScalarType &innerUpdate ) { + innerUpdate += A( i , j ) * x( j ); + } , sum_i ); + Kokkos::single( Kokkos::PerThread( teamMember ) , [&] () { + update += y( i ) * sum_i; + } ); + } ); + } , result ); + + const ScalarType solution= ( ScalarType ) nrows * ( ScalarType ) ncols; + ASSERT_EQ( solution , result ); + } +}; + +#else /* #if ( ! 
defined( KOKKOS_HAVE_CUDA ) ) || defined( KOKKOS_CUDA_USE_LAMBDA ) */ + +template< typename ScalarType , class DeviceType > +class TestTripleNestedReduce +{ +public: + typedef DeviceType execution_space ; + typedef typename execution_space::size_type size_type ; + + TestTripleNestedReduce( const size_type & , const size_type + , const size_type & , const size_type ) + { } +}; + +#endif + +//-------------------------------------------------------------------------- + +namespace Test { +namespace ReduceCombinatorical { + +template<class Scalar,class Space = Kokkos::HostSpace> +struct AddPlus { +public: + //Required + typedef AddPlus reducer_type; + typedef Scalar value_type; + + typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type; + +private: + result_view_type result; + +public: + + AddPlus(value_type& result_):result(&result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest += src + 1; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest += src + 1; + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val = value_type(); + } + + result_view_type result_view() const { + return result; + } +}; + +template<int ISTEAM> +struct FunctorScalar; + +template<> +struct FunctorScalar<0>{ + FunctorScalar(Kokkos::View<double> r):result(r) {} + Kokkos::View<double> result; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i,double& update) const { + update+=i; + } +}; + +template<> +struct FunctorScalar<1>{ + FunctorScalar(Kokkos::View<double> r):result(r) {} + Kokkos::View<double> result; + + typedef Kokkos::TeamPolicy<>::member_type team_type; + KOKKOS_INLINE_FUNCTION + void operator() (const team_type& team,double& update) const { + update+=1.0/team.team_size()*team.league_rank(); + } +}; + +template<int ISTEAM> +struct FunctorScalarInit; + +template<> +struct 
FunctorScalarInit<0> { + FunctorScalarInit(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, double& update) const { + update += i; + } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { + update = 0.0; + } +}; + +template<> +struct FunctorScalarInit<1> { + FunctorScalarInit(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + + typedef Kokkos::TeamPolicy<>::member_type team_type; + KOKKOS_INLINE_FUNCTION + void operator() (const team_type& team,double& update) const { + update+=1.0/team.team_size()*team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { + update = 0.0; + } +}; + +template<int ISTEAM> +struct FunctorScalarFinal; + + +template<> +struct FunctorScalarFinal<0> { + FunctorScalarFinal(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, double& update) const { + update += i; + } + + KOKKOS_INLINE_FUNCTION + void final(double& update) const { + result() = update; + } +}; + +template<> +struct FunctorScalarFinal<1> { + FunctorScalarFinal(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + + typedef Kokkos::TeamPolicy<>::member_type team_type; + + KOKKOS_INLINE_FUNCTION + void operator() (const team_type& team, double& update) const { + update+=1.0/team.team_size()*team.league_rank(); + } + KOKKOS_INLINE_FUNCTION + void final(double& update) const { + result() = update; + } +}; + +template<int ISTEAM> +struct FunctorScalarJoin; + +template<> +struct FunctorScalarJoin<0> { + FunctorScalarJoin(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, double& update) const { + update += i; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } +}; + +template<> +struct FunctorScalarJoin<1> 
{ + FunctorScalarJoin(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + + typedef Kokkos::TeamPolicy<>::member_type team_type; + KOKKOS_INLINE_FUNCTION + void operator() (const team_type& team,double& update) const { + update+=1.0/team.team_size()*team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } +}; + +template<int ISTEAM> +struct FunctorScalarJoinFinal; + +template<> +struct FunctorScalarJoinFinal<0> { + FunctorScalarJoinFinal(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, double& update) const { + update += i; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void final(double& update) const { + result() = update; + } +}; + +template<> +struct FunctorScalarJoinFinal<1> { + FunctorScalarJoinFinal(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + + typedef Kokkos::TeamPolicy<>::member_type team_type; + KOKKOS_INLINE_FUNCTION + void operator() (const team_type& team,double& update) const { + update+=1.0/team.team_size()*team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void final(double& update) const { + result() = update; + } +}; + +template<int ISTEAM> +struct FunctorScalarJoinInit; + +template<> +struct FunctorScalarJoinInit<0> { + FunctorScalarJoinInit(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, double& update) const { + update += i; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { + update = 0.0; + } 
+}; + +template<> +struct FunctorScalarJoinInit<1> { + FunctorScalarJoinInit(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + + typedef Kokkos::TeamPolicy<>::member_type team_type; + KOKKOS_INLINE_FUNCTION + void operator() (const team_type& team,double& update) const { + update+=1.0/team.team_size()*team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { + update = 0.0; + } +}; + +template<int ISTEAM> +struct FunctorScalarJoinFinalInit; + +template<> +struct FunctorScalarJoinFinalInit<0> { + FunctorScalarJoinFinalInit(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, double& update) const { + update += i; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void final(double& update) const { + result() = update; + } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { + update = 0.0; + } +}; + +template<> +struct FunctorScalarJoinFinalInit<1> { + FunctorScalarJoinFinalInit(Kokkos::View<double> r):result(r) {} + + Kokkos::View<double> result; + + typedef Kokkos::TeamPolicy<>::member_type team_type; + KOKKOS_INLINE_FUNCTION + void operator() (const team_type& team,double& update) const { + update+=1.0/team.team_size()*team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void final(double& update) const { + result() = update; + } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { + update = 0.0; + } +}; +struct Functor1 { + KOKKOS_INLINE_FUNCTION + void operator() (const int& i,double& update) const { + update+=i; + } +}; + +struct Functor2 { + typedef double value_type[]; + 
const unsigned value_count; + + Functor2(unsigned n):value_count(n){} + + KOKKOS_INLINE_FUNCTION + void operator() (const unsigned& i,double update[]) const { + for(unsigned j=0;j<value_count;j++) + update[j]+=i; + } + + KOKKOS_INLINE_FUNCTION + void init( double dst[] ) const + { + for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] = 0 ; + } + + KOKKOS_INLINE_FUNCTION + void join( volatile double dst[] , + const volatile double src[] ) const + { + for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] += src[i] ; + } +}; + +} +} + +namespace Test { + +template<class ExecSpace = Kokkos::DefaultExecutionSpace> +struct TestReduceCombinatoricalInstantiation { + template<class ... Args> + static void CallParallelReduce(Args... args) { + Kokkos::parallel_reduce(args...); + } + + template<class ... Args> + static void AddReturnArgument(Args... args) { + Kokkos::View<double,Kokkos::HostSpace> result_view("ResultView"); + double expected_result = 1000.0*999.0/2.0; + + double value = 0; + Kokkos::parallel_reduce(args...,value); + ASSERT_EQ(expected_result,value); + + result_view() = 0; + CallParallelReduce(args...,result_view); + ASSERT_EQ(expected_result,result_view()); + + value = 0; + CallParallelReduce(args...,Kokkos::View<double,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(&value)); + ASSERT_EQ(expected_result,value); + + result_view() = 0; + const Kokkos::View<double,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> result_view_const_um = result_view; + CallParallelReduce(args...,result_view_const_um); + ASSERT_EQ(expected_result,result_view_const_um()); + + value = 0; + CallParallelReduce(args...,Test::ReduceCombinatorical::AddPlus<double>(value)); + if((Kokkos::DefaultExecutionSpace::concurrency() > 1) && (ExecSpace::concurrency()>1)) + ASSERT_TRUE(expected_result<value); + else if((Kokkos::DefaultExecutionSpace::concurrency() > 1) || (ExecSpace::concurrency()>1)) + ASSERT_TRUE(expected_result<=value); + else + 
ASSERT_EQ(expected_result,value); + + value = 0; + Test::ReduceCombinatorical::AddPlus<double> add(value); + CallParallelReduce(args...,add); + if((Kokkos::DefaultExecutionSpace::concurrency() > 1) && (ExecSpace::concurrency()>1)) + ASSERT_TRUE(expected_result<value); + else if((Kokkos::DefaultExecutionSpace::concurrency() > 1) || (ExecSpace::concurrency()>1)) + ASSERT_TRUE(expected_result<=value); + else + ASSERT_EQ(expected_result,value); + } + + + template<class ... Args> + static void AddLambdaRange(void*,Args... args) { + AddReturnArgument(args..., KOKKOS_LAMBDA (const int&i , double& lsum) { + lsum += i; + }); + } + + template<class ... Args> + static void AddLambdaTeam(void*,Args... args) { + AddReturnArgument(args..., KOKKOS_LAMBDA (const Kokkos::TeamPolicy<>::member_type& team, double& update) { + update+=1.0/team.team_size()*team.league_rank(); + }); + } + + template<class ... Args> + static void AddLambdaRange(Kokkos::InvalidType,Args... args) { + } + + template<class ... Args> + static void AddLambdaTeam(Kokkos::InvalidType,Args... args) { + } + + template<int ISTEAM, class ... Args> + static void AddFunctor(Args... 
args) { + Kokkos::View<double> result_view("FunctorView"); + auto h_r = Kokkos::create_mirror_view(result_view); + Test::ReduceCombinatorical::FunctorScalar<ISTEAM> functor(result_view); + double expected_result = 1000.0*999.0/2.0; + + AddReturnArgument(args..., functor); + AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalar<ISTEAM>(result_view)); + AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarInit<ISTEAM>(result_view)); + AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarJoin<ISTEAM>(result_view)); + AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarJoinInit<ISTEAM>(result_view)); + + h_r() = 0; + Kokkos::deep_copy(result_view,h_r); + CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarFinal<ISTEAM>(result_view)); + Kokkos::deep_copy(h_r,result_view); + ASSERT_EQ(expected_result,h_r()); + + h_r() = 0; + Kokkos::deep_copy(result_view,h_r); + CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarJoinFinal<ISTEAM>(result_view)); + Kokkos::deep_copy(h_r,result_view); + ASSERT_EQ(expected_result,h_r()); + + h_r() = 0; + Kokkos::deep_copy(result_view,h_r); + CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarJoinFinalInit<ISTEAM>(result_view)); + Kokkos::deep_copy(h_r,result_view); + ASSERT_EQ(expected_result,h_r()); + } + + template<class ... Args> + static void AddFunctorLambdaRange(Args... args) { + AddFunctor<0,Args...>(args...); + #ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA + AddLambdaRange(typename std::conditional<std::is_same<ExecSpace,Kokkos::DefaultExecutionSpace>::value,void*,Kokkos::InvalidType>::type(), args...); + #endif + } + + template<class ... Args> + static void AddFunctorLambdaTeam(Args... 
args) { + AddFunctor<1,Args...>(args...); + #ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA + AddLambdaTeam(typename std::conditional<std::is_same<ExecSpace,Kokkos::DefaultExecutionSpace>::value,void*,Kokkos::InvalidType>::type(), args...); + #endif + } + + template<class ... Args> + static void AddPolicy(Args... args) { + int N = 1000; + Kokkos::RangePolicy<ExecSpace> policy(0,N); + + AddFunctorLambdaRange(args...,1000); + AddFunctorLambdaRange(args...,N); + AddFunctorLambdaRange(args...,policy); + AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace>(0,N)); + AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(0,N)); + AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Static> >(0,N).set_chunk_size(10)); + AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(0,N).set_chunk_size(10)); + + AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace>(N,Kokkos::AUTO)); + AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(N,Kokkos::AUTO)); + AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Static> >(N,Kokkos::AUTO).set_chunk_size(10)); + AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(N,Kokkos::AUTO).set_chunk_size(10)); + } + + + static void AddLabel() { + std::string s("Std::String"); + AddPolicy(); + AddPolicy("Char Constant"); + AddPolicy(s.c_str()); + AddPolicy(s); + } + + static void execute() { + AddLabel(); + } +}; + +template<class Scalar, class ExecSpace = Kokkos::DefaultExecutionSpace> +struct TestReducers { + + struct SumFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, Scalar& value) const { + value += values(i); + } + }; + + struct ProdFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, 
Scalar& value) const { + value *= values(i); + } + }; + + struct MinFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, Scalar& value) const { + if(values(i) < value) + value = values(i); + } + }; + + struct MaxFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, Scalar& value) const { + if(values(i) > value) + value = values(i); + } + }; + + struct MinLocFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, + typename Kokkos::Experimental::MinLoc<Scalar,int>::value_type& value) const { + if(values(i) < value.val) { + value.val = values(i); + value.loc = i; + } + } + }; + + struct MaxLocFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, + typename Kokkos::Experimental::MaxLoc<Scalar,int>::value_type& value) const { + if(values(i) > value.val) { + value.val = values(i); + value.loc = i; + } + } + }; + + struct MinMaxLocFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, + typename Kokkos::Experimental::MinMaxLoc<Scalar,int>::value_type& value) const { + if(values(i) > value.max_val) { + value.max_val = values(i); + value.max_loc = i; + } + if(values(i) < value.min_val) { + value.min_val = values(i); + value.min_loc = i; + } + } + }; + + struct BAndFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, Scalar& value) const { + value = value & values(i); + } + }; + + struct BOrFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, Scalar& value) const { + value = value | values(i); + } + }; + + struct BXorFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, Scalar& value) 
const { + value = value ^ values(i); + } + }; + + struct LAndFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, Scalar& value) const { + value = value && values(i); + } + }; + + struct LOrFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, Scalar& value) const { + value = value || values(i); + } + }; + + struct LXorFunctor { + Kokkos::View<const Scalar*,ExecSpace> values; + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, Scalar& value) const { + value = value ? (!values(i)) : values(i); + } + }; + + static void test_sum(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_sum = 0; + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%100); + reference_sum += h_values(i); + } + Kokkos::deep_copy(values,h_values); + + SumFunctor f; + f.values = values; + Scalar init = 0; + + { + Scalar sum_scalar = init; + Kokkos::Experimental::Sum<Scalar> reducer_scalar(sum_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(sum_scalar,reference_sum); + Scalar sum_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(sum_scalar_view,reference_sum); + } + { + Scalar sum_scalar_init = init; + Kokkos::Experimental::Sum<Scalar> reducer_scalar_init(sum_scalar_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init); + ASSERT_EQ(sum_scalar_init,reference_sum); + Scalar sum_scalar_init_view = reducer_scalar_init.result_view()(); + ASSERT_EQ(sum_scalar_init_view,reference_sum); + } + { + Kokkos::View<Scalar,Kokkos::HostSpace> sum_view("View"); + sum_view() = init; + Kokkos::Experimental::Sum<Scalar> reducer_view(sum_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + Scalar sum_view_scalar = sum_view(); + 
ASSERT_EQ(sum_view_scalar,reference_sum); + Scalar sum_view_view = reducer_view.result_view()(); + ASSERT_EQ(sum_view_view,reference_sum); + } + { + Kokkos::View<Scalar,Kokkos::HostSpace> sum_view_init("View"); + sum_view_init() = init; + Kokkos::Experimental::Sum<Scalar> reducer_view_init(sum_view_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init); + Scalar sum_view_init_scalar = sum_view_init(); + ASSERT_EQ(sum_view_init_scalar,reference_sum); + Scalar sum_view_init_view = reducer_view_init.result_view()(); + ASSERT_EQ(sum_view_init_view,reference_sum); + } + } + + static void test_prod(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_prod = 1; + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%4+1); + reference_prod *= h_values(i); + } + Kokkos::deep_copy(values,h_values); + + ProdFunctor f; + f.values = values; + Scalar init = 1; + + if(std::is_arithmetic<Scalar>::value) + { + Scalar prod_scalar = init; + Kokkos::Experimental::Prod<Scalar> reducer_scalar(prod_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(prod_scalar,reference_prod); + Scalar prod_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(prod_scalar_view,reference_prod); + } + { + Scalar prod_scalar_init = init; + Kokkos::Experimental::Prod<Scalar> reducer_scalar_init(prod_scalar_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init); + ASSERT_EQ(prod_scalar_init,reference_prod); + Scalar prod_scalar_init_view = reducer_scalar_init.result_view()(); + ASSERT_EQ(prod_scalar_init_view,reference_prod); + } + + if(std::is_arithmetic<Scalar>::value) + { + Kokkos::View<Scalar,Kokkos::HostSpace> prod_view("View"); + prod_view() = init; + Kokkos::Experimental::Prod<Scalar> reducer_view(prod_view); + 
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + Scalar prod_view_scalar = prod_view(); + ASSERT_EQ(prod_view_scalar,reference_prod); + Scalar prod_view_view = reducer_view.result_view()(); + ASSERT_EQ(prod_view_view,reference_prod); + } + { + Kokkos::View<Scalar,Kokkos::HostSpace> prod_view_init("View"); + prod_view_init() = init; + Kokkos::Experimental::Prod<Scalar> reducer_view_init(prod_view_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init); + Scalar prod_view_init_scalar = prod_view_init(); + ASSERT_EQ(prod_view_init_scalar,reference_prod); + Scalar prod_view_init_view = reducer_view_init.result_view()(); + ASSERT_EQ(prod_view_init_view,reference_prod); + } + } + + static void test_min(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_min = std::numeric_limits<Scalar>::max(); + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%100000); + if(h_values(i)<reference_min) + reference_min = h_values(i); + } + Kokkos::deep_copy(values,h_values); + + MinFunctor f; + f.values = values; + Scalar init = std::numeric_limits<Scalar>::max(); + + { + Scalar min_scalar = init; + Kokkos::Experimental::Min<Scalar> reducer_scalar(min_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(min_scalar,reference_min); + Scalar min_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(min_scalar_view,reference_min); + } + { + Scalar min_scalar_init = init; + Kokkos::Experimental::Min<Scalar> reducer_scalar_init(min_scalar_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init); + ASSERT_EQ(min_scalar_init,reference_min); + Scalar min_scalar_init_view = reducer_scalar_init.result_view()(); + ASSERT_EQ(min_scalar_init_view,reference_min); + } + { + Kokkos::View<Scalar,Kokkos::HostSpace> min_view("View"); + min_view() = init; + 
Kokkos::Experimental::Min<Scalar> reducer_view(min_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + Scalar min_view_scalar = min_view(); + ASSERT_EQ(min_view_scalar,reference_min); + Scalar min_view_view = reducer_view.result_view()(); + ASSERT_EQ(min_view_view,reference_min); + } + { + Kokkos::View<Scalar,Kokkos::HostSpace> min_view_init("View"); + min_view_init() = init; + Kokkos::Experimental::Min<Scalar> reducer_view_init(min_view_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init); + Scalar min_view_init_scalar = min_view_init(); + ASSERT_EQ(min_view_init_scalar,reference_min); + Scalar min_view_init_view = reducer_view_init.result_view()(); + ASSERT_EQ(min_view_init_view,reference_min); + } + } + + static void test_max(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_max = std::numeric_limits<Scalar>::min(); + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%100000+1); + if(h_values(i)>reference_max) + reference_max = h_values(i); + } + Kokkos::deep_copy(values,h_values); + + MaxFunctor f; + f.values = values; + Scalar init = std::numeric_limits<Scalar>::min(); + + { + Scalar max_scalar = init; + Kokkos::Experimental::Max<Scalar> reducer_scalar(max_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(max_scalar,reference_max); + Scalar max_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(max_scalar_view,reference_max); + } + { + Scalar max_scalar_init = init; + Kokkos::Experimental::Max<Scalar> reducer_scalar_init(max_scalar_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init); + ASSERT_EQ(max_scalar_init,reference_max); + Scalar max_scalar_init_view = reducer_scalar_init.result_view()(); + ASSERT_EQ(max_scalar_init_view,reference_max); + } + { + Kokkos::View<Scalar,Kokkos::HostSpace> 
max_view("View"); + max_view() = init; + Kokkos::Experimental::Max<Scalar> reducer_view(max_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + Scalar max_view_scalar = max_view(); + ASSERT_EQ(max_view_scalar,reference_max); + Scalar max_view_view = reducer_view.result_view()(); + ASSERT_EQ(max_view_view,reference_max); + } + { + Kokkos::View<Scalar,Kokkos::HostSpace> max_view_init("View"); + max_view_init() = init; + Kokkos::Experimental::Max<Scalar> reducer_view_init(max_view_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init); + Scalar max_view_init_scalar = max_view_init(); + ASSERT_EQ(max_view_init_scalar,reference_max); + Scalar max_view_init_view = reducer_view_init.result_view()(); + ASSERT_EQ(max_view_init_view,reference_max); + } + } + + static void test_minloc(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_min = std::numeric_limits<Scalar>::max(); + int reference_loc = -1; + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%100000); + if(h_values(i)<reference_min) { + reference_min = h_values(i); + reference_loc = i; + } + } + Kokkos::deep_copy(values,h_values); + + MinLocFunctor f; + typedef typename Kokkos::Experimental::MinLoc<Scalar,int>::value_type value_type; + f.values = values; + Scalar init = std::numeric_limits<Scalar>::max(); + + + { + value_type min_scalar; + Kokkos::Experimental::MinLoc<Scalar,int> reducer_scalar(min_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(min_scalar.val,reference_min); + ASSERT_EQ(min_scalar.loc,reference_loc); + value_type min_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(min_scalar_view.val,reference_min); + ASSERT_EQ(min_scalar_view.loc,reference_loc); + } + { + value_type min_scalar_init; + Kokkos::Experimental::MinLoc<Scalar,int> reducer_scalar_init(min_scalar_init,init); + 
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init); + ASSERT_EQ(min_scalar_init.val,reference_min); + ASSERT_EQ(min_scalar_init.loc,reference_loc); + value_type min_scalar_init_view = reducer_scalar_init.result_view()(); + ASSERT_EQ(min_scalar_init_view.val,reference_min); + ASSERT_EQ(min_scalar_init_view.loc,reference_loc); + } + { + Kokkos::View<value_type,Kokkos::HostSpace> min_view("View"); + Kokkos::Experimental::MinLoc<Scalar,int> reducer_view(min_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + value_type min_view_scalar = min_view(); + ASSERT_EQ(min_view_scalar.val,reference_min); + ASSERT_EQ(min_view_scalar.loc,reference_loc); + value_type min_view_view = reducer_view.result_view()(); + ASSERT_EQ(min_view_view.val,reference_min); + ASSERT_EQ(min_view_view.loc,reference_loc); + } + { + Kokkos::View<value_type,Kokkos::HostSpace> min_view_init("View"); + Kokkos::Experimental::MinLoc<Scalar,int> reducer_view_init(min_view_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init); + value_type min_view_init_scalar = min_view_init(); + ASSERT_EQ(min_view_init_scalar.val,reference_min); + ASSERT_EQ(min_view_init_scalar.loc,reference_loc); + value_type min_view_init_view = reducer_view_init.result_view()(); + ASSERT_EQ(min_view_init_view.val,reference_min); + ASSERT_EQ(min_view_init_view.loc,reference_loc); + } + } + + static void test_maxloc(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_max = std::numeric_limits<Scalar>::min(); + int reference_loc = -1; + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%100000); + if(h_values(i)>reference_max) { + reference_max = h_values(i); + reference_loc = i; + } + } + Kokkos::deep_copy(values,h_values); + + MaxLocFunctor f; + typedef typename Kokkos::Experimental::MaxLoc<Scalar,int>::value_type value_type; + f.values = 
values; + Scalar init = std::numeric_limits<Scalar>::min(); + + + { + value_type max_scalar; + Kokkos::Experimental::MaxLoc<Scalar,int> reducer_scalar(max_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(max_scalar.val,reference_max); + ASSERT_EQ(max_scalar.loc,reference_loc); + value_type max_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(max_scalar_view.val,reference_max); + ASSERT_EQ(max_scalar_view.loc,reference_loc); + } + { + value_type max_scalar_init; + Kokkos::Experimental::MaxLoc<Scalar,int> reducer_scalar_init(max_scalar_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init); + ASSERT_EQ(max_scalar_init.val,reference_max); + ASSERT_EQ(max_scalar_init.loc,reference_loc); + value_type max_scalar_init_view = reducer_scalar_init.result_view()(); + ASSERT_EQ(max_scalar_init_view.val,reference_max); + ASSERT_EQ(max_scalar_init_view.loc,reference_loc); + } + { + Kokkos::View<value_type,Kokkos::HostSpace> max_view("View"); + Kokkos::Experimental::MaxLoc<Scalar,int> reducer_view(max_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + value_type max_view_scalar = max_view(); + ASSERT_EQ(max_view_scalar.val,reference_max); + ASSERT_EQ(max_view_scalar.loc,reference_loc); + value_type max_view_view = reducer_view.result_view()(); + ASSERT_EQ(max_view_view.val,reference_max); + ASSERT_EQ(max_view_view.loc,reference_loc); + } + { + Kokkos::View<value_type,Kokkos::HostSpace> max_view_init("View"); + Kokkos::Experimental::MaxLoc<Scalar,int> reducer_view_init(max_view_init,init); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init); + value_type max_view_init_scalar = max_view_init(); + ASSERT_EQ(max_view_init_scalar.val,reference_max); + ASSERT_EQ(max_view_init_scalar.loc,reference_loc); + value_type max_view_init_view = reducer_view_init.result_view()(); + 
ASSERT_EQ(max_view_init_view.val,reference_max); + ASSERT_EQ(max_view_init_view.loc,reference_loc); + } + } + + static void test_minmaxloc(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_max = std::numeric_limits<Scalar>::min(); + Scalar reference_min = std::numeric_limits<Scalar>::max(); + int reference_minloc = -1; + int reference_maxloc = -1; + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%100000); + if(h_values(i)>reference_max) { + reference_max = h_values(i); + reference_maxloc = i; + } + if(h_values(i)<reference_min) { + reference_min = h_values(i); + reference_minloc = i; + } + } + Kokkos::deep_copy(values,h_values); + + MinMaxLocFunctor f; + typedef typename Kokkos::Experimental::MinMaxLoc<Scalar,int>::value_type value_type; + f.values = values; + Scalar init_min = std::numeric_limits<Scalar>::max(); + Scalar init_max = std::numeric_limits<Scalar>::min(); + + + { + value_type minmax_scalar; + Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_scalar(minmax_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(minmax_scalar.min_val,reference_min); + ASSERT_EQ(minmax_scalar.min_loc,reference_minloc); + ASSERT_EQ(minmax_scalar.max_val,reference_max); + ASSERT_EQ(minmax_scalar.max_loc,reference_maxloc); + value_type minmax_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(minmax_scalar_view.min_val,reference_min); + ASSERT_EQ(minmax_scalar_view.min_loc,reference_minloc); + ASSERT_EQ(minmax_scalar_view.max_val,reference_max); + ASSERT_EQ(minmax_scalar_view.max_loc,reference_maxloc); + } + { + value_type minmax_scalar_init; + Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_scalar_init(minmax_scalar_init,init_min,init_max); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init); + ASSERT_EQ(minmax_scalar_init.min_val,reference_min); + 
ASSERT_EQ(minmax_scalar_init.min_loc,reference_minloc); + ASSERT_EQ(minmax_scalar_init.max_val,reference_max); + ASSERT_EQ(minmax_scalar_init.max_loc,reference_maxloc); + value_type minmax_scalar_init_view = reducer_scalar_init.result_view()(); + ASSERT_EQ(minmax_scalar_init_view.min_val,reference_min); + ASSERT_EQ(minmax_scalar_init_view.min_loc,reference_minloc); + ASSERT_EQ(minmax_scalar_init_view.max_val,reference_max); + ASSERT_EQ(minmax_scalar_init_view.max_loc,reference_maxloc); + } + { + Kokkos::View<value_type,Kokkos::HostSpace> minmax_view("View"); + Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_view(minmax_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + value_type minmax_view_scalar = minmax_view(); + ASSERT_EQ(minmax_view_scalar.min_val,reference_min); + ASSERT_EQ(minmax_view_scalar.min_loc,reference_minloc); + ASSERT_EQ(minmax_view_scalar.max_val,reference_max); + ASSERT_EQ(minmax_view_scalar.max_loc,reference_maxloc); + value_type minmax_view_view = reducer_view.result_view()(); + ASSERT_EQ(minmax_view_view.min_val,reference_min); + ASSERT_EQ(minmax_view_view.min_loc,reference_minloc); + ASSERT_EQ(minmax_view_view.max_val,reference_max); + ASSERT_EQ(minmax_view_view.max_loc,reference_maxloc); + } + { + Kokkos::View<value_type,Kokkos::HostSpace> minmax_view_init("View"); + Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_view_init(minmax_view_init,init_min,init_max); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init); + value_type minmax_view_init_scalar = minmax_view_init(); + ASSERT_EQ(minmax_view_init_scalar.min_val,reference_min); + ASSERT_EQ(minmax_view_init_scalar.min_loc,reference_minloc); + ASSERT_EQ(minmax_view_init_scalar.max_val,reference_max); + ASSERT_EQ(minmax_view_init_scalar.max_loc,reference_maxloc); + value_type minmax_view_init_view = reducer_view_init.result_view()(); + ASSERT_EQ(minmax_view_init_view.min_val,reference_min); + 
ASSERT_EQ(minmax_view_init_view.min_loc,reference_minloc); + ASSERT_EQ(minmax_view_init_view.max_val,reference_max); + ASSERT_EQ(minmax_view_init_view.max_loc,reference_maxloc); + } + } + + static void test_BAnd(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_band = Scalar() | (~Scalar()); + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%100000+1); + reference_band = reference_band & h_values(i); + } + Kokkos::deep_copy(values,h_values); + + BAndFunctor f; + f.values = values; + Scalar init = Scalar() | (~Scalar()); + + { + Scalar band_scalar = init; + Kokkos::Experimental::BAnd<Scalar> reducer_scalar(band_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(band_scalar,reference_band); + Scalar band_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(band_scalar_view,reference_band); + } + + { + Kokkos::View<Scalar,Kokkos::HostSpace> band_view("View"); + band_view() = init; + Kokkos::Experimental::BAnd<Scalar> reducer_view(band_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + Scalar band_view_scalar = band_view(); + ASSERT_EQ(band_view_scalar,reference_band); + Scalar band_view_view = reducer_view.result_view()(); + ASSERT_EQ(band_view_view,reference_band); + } + } + + static void test_BOr(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_bor = Scalar() & (~Scalar()); + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)((rand()%100000+1)*2); + reference_bor = reference_bor | h_values(i); + } + Kokkos::deep_copy(values,h_values); + + BOrFunctor f; + f.values = values; + Scalar init = Scalar() & (~Scalar()); + + { + Scalar bor_scalar = init; + Kokkos::Experimental::BOr<Scalar> reducer_scalar(bor_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + 
ASSERT_EQ(bor_scalar,reference_bor); + Scalar bor_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(bor_scalar_view,reference_bor); + } + + { + Kokkos::View<Scalar,Kokkos::HostSpace> bor_view("View"); + bor_view() = init; + Kokkos::Experimental::BOr<Scalar> reducer_view(bor_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + Scalar bor_view_scalar = bor_view(); + ASSERT_EQ(bor_view_scalar,reference_bor); + Scalar bor_view_view = reducer_view.result_view()(); + ASSERT_EQ(bor_view_view,reference_bor); + } + } + + static void test_BXor(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_bxor = Scalar() & (~Scalar()); + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)((rand()%100000+1)*2); + reference_bxor = reference_bxor ^ h_values(i); + } + Kokkos::deep_copy(values,h_values); + + BXorFunctor f; + f.values = values; + Scalar init = Scalar() & (~Scalar()); + + { + Scalar bxor_scalar = init; + Kokkos::Experimental::BXor<Scalar> reducer_scalar(bxor_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(bxor_scalar,reference_bxor); + Scalar bxor_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(bxor_scalar_view,reference_bxor); + } + + { + Kokkos::View<Scalar,Kokkos::HostSpace> bxor_view("View"); + bxor_view() = init; + Kokkos::Experimental::BXor<Scalar> reducer_view(bxor_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + Scalar bxor_view_scalar = bxor_view(); + ASSERT_EQ(bxor_view_scalar,reference_bxor); + Scalar bxor_view_view = reducer_view.result_view()(); + ASSERT_EQ(bxor_view_view,reference_bxor); + } + } + static void test_LAnd(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_land = 1; + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%2); + 
reference_land = reference_land && h_values(i); + } + Kokkos::deep_copy(values,h_values); + + LAndFunctor f; + f.values = values; + Scalar init = 1; + + { + Scalar land_scalar = init; + Kokkos::Experimental::LAnd<Scalar> reducer_scalar(land_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(land_scalar,reference_land); + Scalar land_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(land_scalar_view,reference_land); + } + + { + Kokkos::View<Scalar,Kokkos::HostSpace> land_view("View"); + land_view() = init; + Kokkos::Experimental::LAnd<Scalar> reducer_view(land_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + Scalar land_view_scalar = land_view(); + ASSERT_EQ(land_view_scalar,reference_land); + Scalar land_view_view = reducer_view.result_view()(); + ASSERT_EQ(land_view_view,reference_land); + } + } + + static void test_LOr(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_lor = 0; + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%2); + reference_lor = reference_lor || h_values(i); + } + Kokkos::deep_copy(values,h_values); + + LOrFunctor f; + f.values = values; + Scalar init = 0; + + { + Scalar lor_scalar = init; + Kokkos::Experimental::LOr<Scalar> reducer_scalar(lor_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(lor_scalar,reference_lor); + Scalar lor_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(lor_scalar_view,reference_lor); + } + + { + Kokkos::View<Scalar,Kokkos::HostSpace> lor_view("View"); + lor_view() = init; + Kokkos::Experimental::LOr<Scalar> reducer_view(lor_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + Scalar lor_view_scalar = lor_view(); + ASSERT_EQ(lor_view_scalar,reference_lor); + Scalar lor_view_view = reducer_view.result_view()(); + 
ASSERT_EQ(lor_view_view,reference_lor); + } + } + + static void test_LXor(int N) { + Kokkos::View<Scalar*,ExecSpace> values("Values",N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_lxor = 0; + for(int i=0; i<N; i++) { + h_values(i) = (Scalar)(rand()%2); + reference_lxor = reference_lxor ? (!h_values(i)) : h_values(i); + } + Kokkos::deep_copy(values,h_values); + + LXorFunctor f; + f.values = values; + Scalar init = 0; + + { + Scalar lxor_scalar = init; + Kokkos::Experimental::LXor<Scalar> reducer_scalar(lxor_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar); + ASSERT_EQ(lxor_scalar,reference_lxor); + Scalar lxor_scalar_view = reducer_scalar.result_view()(); + ASSERT_EQ(lxor_scalar_view,reference_lxor); + } + + { + Kokkos::View<Scalar,Kokkos::HostSpace> lxor_view("View"); + lxor_view() = init; + Kokkos::Experimental::LXor<Scalar> reducer_view(lxor_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view); + Scalar lxor_view_scalar = lxor_view(); + ASSERT_EQ(lxor_view_scalar,reference_lxor); + Scalar lxor_view_view = reducer_view.result_view()(); + ASSERT_EQ(lxor_view_view,reference_lxor); + } + } + + static void execute_float() { + test_sum(10001); + test_prod(35); + test_min(10003); + test_minloc(10003); + test_max(10007); + test_maxloc(10007); + test_minmaxloc(10007); + } + + static void execute_integer() { + test_sum(10001); + test_prod(35); + test_min(10003); + test_minloc(10003); + test_max(10007); + test_maxloc(10007); + test_minmaxloc(10007); + test_BAnd(35); + test_BOr(35); + test_BXor(35); + test_LAnd(35); + test_LOr(35); + test_LXor(35); + } + + static void execute_basic() { + test_sum(10001); + test_prod(35); + } +}; } /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/unit_test/TestSerial.cpp b/lib/kokkos/core/unit_test/TestSerial.cpp index 7ddb54241c..d85614e66e 100644 --- 
a/lib/kokkos/core/unit_test/TestSerial.cpp +++ b/lib/kokkos/core/unit_test/TestSerial.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -66,6 +66,7 @@ #include <TestViewOfClass.hpp> #include <TestViewSubview.hpp> #include <TestAtomic.hpp> +#include <TestAtomicOperations.hpp> #include <TestRange.hpp> #include <TestTeam.hpp> #include <TestReduce.hpp> @@ -85,6 +86,8 @@ #include <TestPolicyConstruction.hpp> +#include <TestMDRange.hpp> + namespace Test { class serial : public ::testing::Test { @@ -99,6 +102,12 @@ protected: } }; +TEST_F( serial , md_range ) { + TestMDRange_2D< Kokkos::Serial >::test_for2(100,100); + + TestMDRange_3D< Kokkos::Serial >::test_for3(100,100,100); +} + TEST_F( serial , impl_shared_alloc ) { test_shared_alloc< Kokkos::HostSpace , Kokkos::Serial >(); } @@ -199,6 +208,14 @@ TEST_F( serial, double_reduce) { TestReduce< double , Kokkos::Serial >( 1000000 ); } +TEST_F( serial , reducers ) +{ + TestReducers<int, Kokkos::Serial>::execute_integer(); + TestReducers<size_t, Kokkos::Serial>::execute_integer(); + TestReducers<double, Kokkos::Serial>::execute_float(); + TestReducers<Kokkos::complex<double>, Kokkos::Serial>::execute_basic(); +} + TEST_F( serial, long_reduce_dynamic ) { TestReduceDynamic< long , Kokkos::Serial >( 1000000 ); } @@ -237,13 +254,17 @@ TEST_F( serial , 
team_shared_request) { TestSharedTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >(); } -#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) +#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) TEST_F( serial , team_lambda_shared_request) { TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >(); TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >(); } #endif +TEST_F( serial, shmem_size) { + TestShmemSize< Kokkos::Serial >(); +} + TEST_F( serial , team_scan ) { TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 10 ); @@ -345,6 +366,74 @@ TEST_F( serial , atomics ) ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,3) ) ); } +TEST_F( serial , atomic_operations ) +{ + const int start = 1; //Avoid zero for division + const int end = 11; + for (int i = start; i < end; ++i) + { + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( 
TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( 
TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, 
end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 4 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 4 ) ) ); + } + +} //---------------------------------------------------------------------------- TEST_F( serial, tile_layout ) @@ -391,12 +480,36 @@ TEST_F( serial , memory_pool ) bool val = TestMemoryPool::test_mempool< Kokkos::Serial >( 128, 128000000 ); ASSERT_TRUE( val ); - TestMemoryPool::test_mempool2< Kokkos::Serial >( 128, 128000000 ); + TestMemoryPool::test_mempool2< Kokkos::Serial >( 64, 4, 1000000, 2000000 ); + + TestMemoryPool::test_memory_exhaustion< Kokkos::Serial >(); } 
//---------------------------------------------------------------------------- -TEST_F( serial , task_policy ) +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + +TEST_F( serial , task_fib ) +{ + for ( int i = 0 ; i < 25 ; ++i ) { + TestTaskPolicy::TestFib< Kokkos::Serial >::run(i); + } +} + +TEST_F( serial , task_depend ) +{ + for ( int i = 0 ; i < 25 ; ++i ) { + TestTaskPolicy::TestTaskDependence< Kokkos::Serial >::run(i); + } +} + +TEST_F( serial , task_team ) +{ + TestTaskPolicy::TestTaskTeam< Kokkos::Serial >::run(1000); + //TestTaskPolicy::TestTaskTeamValue< Kokkos::Serial >::run(1000); //put back after testing +} + +TEST_F( serial , old_task_policy ) { TestTaskPolicy::test_task_dep< Kokkos::Serial >( 10 ); // TestTaskPolicy::test_norm2< Kokkos::Serial >( 1000 ); @@ -406,11 +519,13 @@ TEST_F( serial , task_policy ) for ( long i = 0 ; i < 25 ; ++i ) TestTaskPolicy::test_fib2< Kokkos::Serial >(i); } -TEST_F( serial , task_team ) +TEST_F( serial , old_task_team ) { TestTaskPolicy::test_task_team< Kokkos::Serial >(1000); } +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ + //---------------------------------------------------------------------------- TEST_F( serial , template_meta_functions ) diff --git a/lib/kokkos/core/unit_test/TestTaskPolicy.hpp b/lib/kokkos/core/unit_test/TestTaskPolicy.hpp index e5c461af01..71790f6def 100644 --- a/lib/kokkos/core/unit_test/TestTaskPolicy.hpp +++ b/lib/kokkos/core/unit_test/TestTaskPolicy.hpp @@ -50,10 +50,489 @@ #include <cmath> #include <Kokkos_TaskPolicy.hpp> +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace TestTaskPolicy { + +namespace { + +long eval_fib( long n ) +{ + constexpr long mask = 0x03 ; + + long fib[4] = { 0 , 1 , 1 , 2 }; + + for ( long i = 2 ; i <= n ; ++i ) { + fib[ i & mask ] = fib[ ( i - 1 ) & mask ] + fib[ ( i - 2 ) & mask ]; + } + + return 
fib[ n & mask ]; +} + +} + +template< typename Space > +struct TestFib +{ + typedef Kokkos::TaskPolicy<Space> policy_type ; + typedef Kokkos::Future<long,Space> future_type ; + typedef long value_type ; + + policy_type policy ; + future_type fib_m1 ; + future_type fib_m2 ; + const value_type n ; + + KOKKOS_INLINE_FUNCTION + TestFib( const policy_type & arg_policy , const value_type arg_n ) + : policy(arg_policy) + , fib_m1() , fib_m2() + , n( arg_n ) + {} + + KOKKOS_INLINE_FUNCTION + void operator()( typename policy_type::member_type & , value_type & result ) + { +#if 0 + printf( "\nTestFib(%ld) %d %d\n" + , n + , int( ! fib_m1.is_null() ) + , int( ! fib_m2.is_null() ) + ); +#endif + + if ( n < 2 ) { + result = n ; + } + else if ( ! fib_m2.is_null() && ! fib_m1.is_null() ) { + result = fib_m1.get() + fib_m2.get(); + } + else { + + // Spawn new children and respawn myself to sum their results: + // Spawn lower value at higher priority as it has a shorter + // path to completion. + + fib_m2 = policy.task_spawn( TestFib(policy,n-2) + , Kokkos::TaskSingle + , Kokkos::TaskHighPriority ); + + fib_m1 = policy.task_spawn( TestFib(policy,n-1) + , Kokkos::TaskSingle ); + + Kokkos::Future<Space> dep[] = { fib_m1 , fib_m2 }; + + Kokkos::Future<Space> fib_all = policy.when_all( 2 , dep ); + + if ( ! fib_m2.is_null() && ! fib_m1.is_null() && ! 
fib_all.is_null() ) { + // High priority to retire this branch + policy.respawn( this , Kokkos::TaskHighPriority , fib_all ); + } + else { +#if 0 + printf( "TestFib(%ld) insufficient memory alloc_capacity(%d) task_max(%d) task_accum(%ld)\n" + , n + , policy.allocation_capacity() + , policy.allocated_task_count_max() + , policy.allocated_task_count_accum() + ); +#endif + Kokkos::abort("TestFib insufficient memory"); + + } + } + } + + static void run( int i , size_t MemoryCapacity = 16000 ) + { + typedef typename policy_type::memory_space memory_space ; + + enum { Log2_SuperBlockSize = 12 }; + + policy_type root_policy( memory_space() , MemoryCapacity , Log2_SuperBlockSize ); + + future_type f = root_policy.host_spawn( TestFib(root_policy,i) , Kokkos::TaskSingle ); + Kokkos::wait( root_policy ); + ASSERT_EQ( eval_fib(i) , f.get() ); + +#if 0 + fprintf( stdout , "\nTestFib::run(%d) spawn_size(%d) when_all_size(%d) alloc_capacity(%d) task_max(%d) task_accum(%ld)\n" + , i + , int(root_policy.template spawn_allocation_size<TestFib>()) + , int(root_policy.when_all_allocation_size(2)) + , root_policy.allocation_capacity() + , root_policy.allocated_task_count_max() + , root_policy.allocated_task_count_accum() + ); + fflush( stdout ); +#endif + } + +}; + +} // namespace TestTaskPolicy + +//---------------------------------------------------------------------------- + namespace TestTaskPolicy { +template< class Space > +struct TestTaskDependence { + + typedef Kokkos::TaskPolicy<Space> policy_type ; + typedef Kokkos::Future<Space> future_type ; + typedef Kokkos::View<long,Space> accum_type ; + typedef void value_type ; + + policy_type m_policy ; + accum_type m_accum ; + long m_count ; + + KOKKOS_INLINE_FUNCTION + TestTaskDependence( long n + , const policy_type & arg_policy + , const accum_type & arg_accum ) + : m_policy( arg_policy ) + , m_accum( arg_accum ) + , m_count( n ) + {} + + KOKKOS_INLINE_FUNCTION + void operator()( typename policy_type::member_type & ) + { + enum { 
CHUNK = 8 }; + const int n = CHUNK < m_count ? CHUNK : m_count ; + + if ( 1 < m_count ) { + future_type f[ CHUNK ] ; + + const int inc = ( m_count + n - 1 ) / n ; + + for ( int i = 0 ; i < n ; ++i ) { + long begin = i * inc ; + long count = begin + inc < m_count ? inc : m_count - begin ; + f[i] = m_policy.task_spawn( TestTaskDependence(count,m_policy,m_accum) , Kokkos::TaskSingle ); + } + + m_count = 0 ; + + m_policy.respawn( this , m_policy.when_all( n , f ) ); + } + else if ( 1 == m_count ) { + Kokkos::atomic_increment( & m_accum() ); + } + } + + static void run( int n ) + { + typedef typename policy_type::memory_space memory_space ; + + // enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool + enum { MemoryCapacity = 16000 }; + enum { Log2_SuperBlockSize = 12 }; + policy_type policy( memory_space() , MemoryCapacity , Log2_SuperBlockSize ); + + accum_type accum("accum"); + + typename accum_type::HostMirror host_accum = + Kokkos::create_mirror_view( accum ); + + policy.host_spawn( TestTaskDependence(n,policy,accum) , Kokkos::TaskSingle ); + + Kokkos::wait( policy ); + + Kokkos::deep_copy( host_accum , accum ); + + ASSERT_EQ( host_accum() , n ); + } +}; + +} // namespace TestTaskPolicy + //---------------------------------------------------------------------------- +namespace TestTaskPolicy { + +template< class ExecSpace > +struct TestTaskTeam { + + //enum { SPAN = 8 }; + enum { SPAN = 33 }; + //enum { SPAN = 1 }; + + typedef void value_type ; + typedef Kokkos::TaskPolicy<ExecSpace> policy_type ; + typedef Kokkos::Future<ExecSpace> future_type ; + typedef Kokkos::View<long*,ExecSpace> view_type ; + + policy_type policy ; + future_type future ; + + view_type parfor_result ; + view_type parreduce_check ; + view_type parscan_result ; + view_type parscan_check ; + const long nvalue ; + + KOKKOS_INLINE_FUNCTION + TestTaskTeam( const policy_type & arg_policy + , const view_type & arg_parfor_result + , const view_type & arg_parreduce_check + , const 
view_type & arg_parscan_result + , const view_type & arg_parscan_check + , const long arg_nvalue ) + : policy(arg_policy) + , future() + , parfor_result( arg_parfor_result ) + , parreduce_check( arg_parreduce_check ) + , parscan_result( arg_parscan_result ) + , parscan_check( arg_parscan_check ) + , nvalue( arg_nvalue ) + {} + + KOKKOS_INLINE_FUNCTION + void operator()( typename policy_type::member_type & member ) + { + const long end = nvalue + 1 ; + const long begin = 0 < end - SPAN ? end - SPAN : 0 ; + + if ( 0 < begin && future.is_null() ) { + if ( member.team_rank() == 0 ) { + future = policy.task_spawn + ( TestTaskTeam( policy , + parfor_result , + parreduce_check, + parscan_result, + parscan_check, + begin - 1 ) + , Kokkos::TaskTeam ); + + assert( ! future.is_null() ); + + policy.respawn( this , future ); + } + return ; + } + + Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end) + , [&]( int i ) { parfor_result[i] = i ; } + ); + + // test parallel_reduce without join + + long tot = 0; + long expected = (begin+end-1)*(end-begin)*0.5; + + Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member,begin,end) + , [&]( int i, long &res) { res += parfor_result[i]; } + , tot); + Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end) + , [&]( int i ) { parreduce_check[i] = expected-tot ; } + ); + + // test parallel_reduce with join + + tot = 0; + Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member,begin,end) + , [&]( int i, long &res) { res += parfor_result[i]; } + , [&]( long& val1, const long& val2) { val1 += val2; } + , tot); + Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end) + , [&]( int i ) { parreduce_check[i] += expected-tot ; } + ); + +#if 0 + // test parallel_scan + + // Exclusive scan + Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange(member,begin,end) + , [&]( int i, long &val , const bool final ) { + if ( final ) { parscan_result[i] = val; } + val += i; + } + ); + + if ( member.team_rank() == 0 ) { + for ( long i 
= begin ; i < end ; ++i ) { + parscan_check[i] = (i*(i-1)-begin*(begin-1))*0.5-parscan_result[i]; + } + } + + // Inclusive scan + Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange(member,begin,end) + , [&]( int i, long &val , const bool final ) { + val += i; + if ( final ) { parscan_result[i] = val; } + } + ); + + if ( member.team_rank() == 0 ) { + for ( long i = begin ; i < end ; ++i ) { + parscan_check[i] += (i*(i+1)-begin*(begin-1))*0.5-parscan_result[i]; + } + } +#endif + + } + + static void run( long n ) + { + // const unsigned memory_capacity = 10000 ; // causes memory pool infinite loop + // const unsigned memory_capacity = 100000 ; // fails with SPAN=1 for serial and OMP + const unsigned memory_capacity = 400000 ; + + policy_type root_policy( typename policy_type::memory_space() + , memory_capacity ); + + view_type root_parfor_result("parfor_result",n+1); + view_type root_parreduce_check("parreduce_check",n+1); + view_type root_parscan_result("parscan_result",n+1); + view_type root_parscan_check("parscan_check",n+1); + + typename view_type::HostMirror + host_parfor_result = Kokkos::create_mirror_view( root_parfor_result ); + typename view_type::HostMirror + host_parreduce_check = Kokkos::create_mirror_view( root_parreduce_check ); + typename view_type::HostMirror + host_parscan_result = Kokkos::create_mirror_view( root_parscan_result ); + typename view_type::HostMirror + host_parscan_check = Kokkos::create_mirror_view( root_parscan_check ); + + future_type f = root_policy.host_spawn( + TestTaskTeam( root_policy , + root_parfor_result , + root_parreduce_check , + root_parscan_result, + root_parscan_check, + n ) , + Kokkos::TaskTeam ); + + Kokkos::wait( root_policy ); + + Kokkos::deep_copy( host_parfor_result , root_parfor_result ); + Kokkos::deep_copy( host_parreduce_check , root_parreduce_check ); + Kokkos::deep_copy( host_parscan_result , root_parscan_result ); + Kokkos::deep_copy( host_parscan_check , root_parscan_check ); + + for ( long i = 0 ; i <= n 
; ++i ) { + const long answer = i ; + if ( host_parfor_result(i) != answer ) { + std::cerr << "TestTaskTeam::run ERROR parallel_for result(" << i << ") = " + << host_parfor_result(i) << " != " << answer << std::endl ; + } + if ( host_parreduce_check(i) != 0 ) { + std::cerr << "TestTaskTeam::run ERROR parallel_reduce check(" << i << ") = " + << host_parreduce_check(i) << " != 0" << std::endl ; + } //TODO + if ( host_parscan_check(i) != 0 ) { + std::cerr << "TestTaskTeam::run ERROR parallel_scan check(" << i << ") = " + << host_parscan_check(i) << " != 0" << std::endl ; + } + } + } +}; + +template< class ExecSpace > +struct TestTaskTeamValue { + + enum { SPAN = 8 }; + + typedef long value_type ; + typedef Kokkos::TaskPolicy<ExecSpace> policy_type ; + typedef Kokkos::Future<value_type,ExecSpace> future_type ; + typedef Kokkos::View<long*,ExecSpace> view_type ; + + policy_type policy ; + future_type future ; + + view_type result ; + const long nvalue ; + + KOKKOS_INLINE_FUNCTION + TestTaskTeamValue( const policy_type & arg_policy + , const view_type & arg_result + , const long arg_nvalue ) + : policy(arg_policy) + , future() + , result( arg_result ) + , nvalue( arg_nvalue ) + {} + + KOKKOS_INLINE_FUNCTION + void operator()( typename policy_type::member_type const & member + , value_type & final ) + { + const long end = nvalue + 1 ; + const long begin = 0 < end - SPAN ? end - SPAN : 0 ; + + if ( 0 < begin && future.is_null() ) { + if ( member.team_rank() == 0 ) { + + future = policy.task_spawn + ( TestTaskTeamValue( policy , result , begin - 1 ) + , Kokkos::TaskTeam ); + + assert( ! 
future.is_null() ); + + policy.respawn( this , future ); + } + return ; + } + + Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end) + , [&]( int i ) { result[i] = i + 1 ; } + ); + + if ( member.team_rank() == 0 ) { + final = result[nvalue] ; + } + + Kokkos::memory_fence(); + } + + static void run( long n ) + { + // const unsigned memory_capacity = 10000 ; // causes memory pool infinite loop + const unsigned memory_capacity = 100000 ; + + policy_type root_policy( typename policy_type::memory_space() + , memory_capacity ); + + view_type root_result("result",n+1); + + typename view_type::HostMirror + host_result = Kokkos::create_mirror_view( root_result ); + + future_type fv = root_policy.host_spawn + ( TestTaskTeamValue( root_policy, root_result, n ) , Kokkos::TaskTeam ); + + Kokkos::wait( root_policy ); + + Kokkos::deep_copy( host_result , root_result ); + + if ( fv.get() != n + 1 ) { + std::cerr << "TestTaskTeamValue ERROR future = " + << fv.get() << " != " << n + 1 << std::endl ; + } + for ( long i = 0 ; i <= n ; ++i ) { + const long answer = i + 1 ; + if ( host_result(i) != answer ) { + std::cerr << "TestTaskTeamValue ERROR result(" << i << ") = " + << host_result(i) << " != " << answer << std::endl ; + } + } + } +}; +} // namespace TestTaskPolicy + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace TestTaskPolicy { + template< class ExecSpace > struct FibChild { @@ -207,28 +686,8 @@ struct FibChild2 { } }; -namespace { - -long eval_fib( long n ) -{ - if ( 2 <= n ) { - std::vector<long> fib(n+1); - - fib[0] = 0 ; - fib[1] = 1 ; - - for ( long i = 2 ; i <= n ; ++i ) { fib[i] = fib[i-2] + fib[i-1]; } - - n = fib[n] ; - } - - return n ; -} - -} - template< class ExecSpace > -void test_fib( long n , const unsigned task_max_count = 1024 ) +void test_fib( long n , const unsigned task_max_count = 4096 ) { const unsigned task_max_size = 
256 ; const unsigned task_dependence = 4 ; @@ -654,9 +1113,15 @@ void test_latch( int n ) typedef TaskLatchRun< ExecSpace > task_type ; typedef typename task_type::policy_type policy_type ; - // Primary + latch + n*LatchAdd - const unsigned task_max_count = n + 2 ; - const unsigned task_max_size = sizeof(task_type); + // Primary + latch + n * LatchAdd + // + // This test uses two different block sizes for allocation from the + // memory pool, so the memory size requested must be big enough to cause two + // or more superblocks to be used. Currently, the superblock size in the + // task policy is 2^16, so make the minimum requested memory size greater + // than this. + const unsigned task_max_count = n + 2 < 256 ? 256 : n + 2; + const unsigned task_max_size = 256; const unsigned task_dependence = 4 ; policy_type @@ -664,16 +1129,17 @@ void test_latch( int n ) , task_max_size , task_dependence ); - policy.spawn( policy.proc_create( TaskLatchRun<ExecSpace>(policy,n) ) ); wait( policy ); } +//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- } // namespace TestTaskPolicy +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ #endif /* #ifndef KOKKOS_UNITTEST_TASKPOLICY_HPP */ diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp index 810e74abdc..db6b0cff7e 100644 --- a/lib/kokkos/core/unit_test/TestTeam.hpp +++ b/lib/kokkos/core/unit_test/TestTeam.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. 
-// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -89,6 +89,34 @@ struct TestTeamPolicy { } } + // included for test_small_league_size + TestTeamPolicy() + : m_flags() + {} + + // included for test_small_league_size + struct NoOpTag {} ; + KOKKOS_INLINE_FUNCTION + void operator()( const NoOpTag & , const team_member & member ) const + {} + + + static void test_small_league_size() { + + int bs = 8; // batch size (number of elements per batch) + int ns = 16; // total number of "problems" to process + + // calculate total scratch memory space size + const int level = 0; + int mem_size = 960; + const int num_teams = ns/bs; + const Kokkos::TeamPolicy< ExecSpace, NoOpTag > policy(num_teams, Kokkos::AUTO()); + + Kokkos::parallel_for ( policy.set_scratch_size(level, Kokkos::PerTeam(mem_size), Kokkos::PerThread(0)) + , TestTeamPolicy() + ); + } + static void test_for( const size_t league_size ) { TestTeamPolicy functor( league_size ); @@ -97,6 +125,8 @@ struct TestTeamPolicy { Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType, ExecSpace >( league_size , team_size ) , functor ); Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType, ExecSpace , VerifyInitTag >( league_size , team_size ) , functor ); + + test_small_league_size(); } struct ReduceTag {}; @@ -617,7 +647,7 @@ struct TestScratchTeam { int team_scratch_size = Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) + Functor::shared_int_array_type::shmem_size(2*team_size); int thread_scratch_size = Functor::shared_int_array_type::shmem_size(Functor::SHARED_THREAD_COUNT); - Kokkos::parallel_reduce( 
team_exec.set_scratch_size(1,Kokkos::PerTeam(team_scratch_size), + Kokkos::parallel_reduce( team_exec.set_scratch_size(0,Kokkos::PerTeam(team_scratch_size), Kokkos::PerThread(thread_scratch_size)) , Functor() , result_type( & error_count ) ); @@ -626,4 +656,255 @@ struct TestScratchTeam { }; } +namespace Test { +template< class ExecSpace> +KOKKOS_INLINE_FUNCTION +int test_team_mulit_level_scratch_loop_body(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) { + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team1(team.team_scratch(0),128); + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread1(team.thread_scratch(0),16); + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team2(team.team_scratch(0),128); + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread2(team.thread_scratch(0),16); + + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team1(team.team_scratch(1),128000); + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread1(team.thread_scratch(1),16000); + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team2(team.team_scratch(1),128000); + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread2(team.thread_scratch(1),16000); + + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team3(team.team_scratch(0),128); + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread3(team.thread_scratch(0),16); + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team3(team.team_scratch(1),128000); + Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread3(team.thread_scratch(1),16000); + + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128), [&] (const int& i) { + a_team1(i) = 1000000 + i; + a_team2(i) = 2000000 + i; + a_team3(i) = 3000000 + i; + }); 
+ team.team_barrier(); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i){ + a_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i; + a_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i; + a_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i; + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i) { + b_team1(i) = 1000000 + i; + b_team2(i) = 2000000 + i; + b_team3(i) = 3000000 + i; + }); + team.team_barrier(); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i){ + b_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i; + b_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i; + b_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i; + }); + + team.team_barrier(); + int error = 0; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128), [&] (const int& i) { + if(a_team1(i) != 1000000 + i) error++; + if(a_team2(i) != 2000000 + i) error++; + if(a_team3(i) != 3000000 + i) error++; + }); + team.team_barrier(); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i){ + if(a_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++; + if(a_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++; + if(a_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++; + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i) { + if(b_team1(i) != 1000000 + i) error++; + if(b_team2(i) != 2000000 + i) error++; + if(b_team3(i) != 3000000 + i) error++; + }); + team.team_barrier(); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i){ + if(b_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++; + if(b_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++; + if( b_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++; + }); + + return error; +} + + +struct TagReduce {}; +struct TagFor {}; + +template< class ExecSpace, class ScheduleType > +struct 
ClassNoShmemSizeFunction { + Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors; + + KOKKOS_INLINE_FUNCTION + void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const { + int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + errors() += error; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const { + error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + } + + void run() { + Kokkos::View<int,ExecSpace> d_errors = Kokkos::View<int,ExecSpace>("Errors"); + errors = d_errors; + + const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128); + const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16); + + const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000); + const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000); + { + Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16); + Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)), + *this); + Kokkos::fence(); + typename Kokkos::View<int,ExecSpace>::HostMirror h_errors = Kokkos::create_mirror_view(d_errors); + Kokkos::deep_copy(h_errors,d_errors); + ASSERT_EQ(h_errors(),0); + } + + { + int error = 0; + Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16); + Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)), + *this,error); + Kokkos::fence(); + ASSERT_EQ(error,0); + } + }; +}; + 
+template< class ExecSpace, class ScheduleType > +struct ClassWithShmemSizeFunction { + Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors; + + KOKKOS_INLINE_FUNCTION + void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const { + int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + errors() += error; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const { + error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + } + + void run() { + Kokkos::View<int,ExecSpace> d_errors = Kokkos::View<int,ExecSpace>("Errors"); + errors = d_errors; + + const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000); + const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000); + { + Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16); + Kokkos::parallel_for(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)), + *this); + Kokkos::fence(); + typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(d_errors); + Kokkos::deep_copy(h_errors,d_errors); + ASSERT_EQ(h_errors(),0); + } + + { + int error = 0; + Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16); + Kokkos::parallel_reduce(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)), + *this,error); + Kokkos::fence(); + ASSERT_EQ(error,0); + } + }; + + unsigned team_shmem_size(int team_size) const { + const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128); + const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16); + return per_team0 + team_size * per_thread0; + } +}; + 
+template< class ExecSpace, class ScheduleType > +void test_team_mulit_level_scratch_test_lambda() { +#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA + Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors; + Kokkos::View<int,ExecSpace> d_errors("Errors"); + errors = d_errors; + + const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128); + const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16); + + const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000); + const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000); + + Kokkos::TeamPolicy<ExecSpace,ScheduleType> policy(10,8,16); + Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)), + KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) { + int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + errors() += error; + }); + Kokkos::fence(); + typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(errors); + Kokkos::deep_copy(h_errors,d_errors); + ASSERT_EQ(h_errors(),0); + + int error = 0; + Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)), + KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team, int& count) { + count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + },error); + ASSERT_EQ(error,0); + Kokkos::fence(); +#endif +} + + +} + +namespace { +template< class ExecSpace, class ScheduleType > +struct TestMultiLevelScratchTeam { + + TestMultiLevelScratchTeam() + { run(); } + + void run() + { +#ifdef 
KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA + Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>(); +#endif + Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1; + c1.run(); + + Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> c2; + c2.run(); + + } +}; +} + +namespace Test { + +template< class ExecSpace > +struct TestShmemSize { + + TestShmemSize() { run(); } + + void run() + { + typedef Kokkos::View< long***, ExecSpace > view_type; + + size_t d1 = 5; + size_t d2 = 6; + size_t d3 = 7; + + size_t size = view_type::shmem_size( d1, d2, d3 ); + + ASSERT_EQ( size, d1 * d2 * d3 * sizeof(long) ); + } +}; +} + /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/unit_test/TestThreads.cpp b/lib/kokkos/core/unit_test/TestThreads.cpp index 03c7c44958..93049b95dd 100644 --- a/lib/kokkos/core/unit_test/TestThreads.cpp +++ b/lib/kokkos/core/unit_test/TestThreads.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -66,6 +66,7 @@ #include <TestViewSubview.hpp> #include <TestViewOfClass.hpp> #include <TestAtomic.hpp> +#include <TestAtomicOperations.hpp> #include <TestReduce.hpp> #include <TestScan.hpp> @@ -87,6 +88,8 @@ #include <TestPolicyConstruction.hpp> +#include <TestMDRange.hpp> + namespace Test { class threads : public ::testing::Test { @@ -112,7 +115,6 @@ protected: Kokkos::Threads::initialize( threads_count ); Kokkos::Threads::finalize(); - threads_count = std::max( 1u , numa_count * 2 ) * std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 ); @@ -143,6 +145,12 @@ TEST_F( threads , init ) { ; } +TEST_F( threads , md_range ) { + TestMDRange_2D< Kokkos::Threads >::test_for2(100,100); + + TestMDRange_3D< Kokkos::Threads >::test_for3(100,100,100); +} + TEST_F( threads , dispatch ) { const int repeat = 100 ; @@ -235,6 +243,13 @@ TEST_F( threads, view_aggregate ) { TEST_F( threads , range_tag ) { + TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(2); + TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2); + TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_scan(2); + TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3); + TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3); + TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3); + TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(2); TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000); @@ -246,6 +261,10 @@ TEST_F( threads , range_tag ) TEST_F( threads , team_tag ) { + TestTeamPolicy< Kokkos::Threads , 
Kokkos::Schedule<Kokkos::Static> >::test_for(2); + TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2); + TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2); + TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2); TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(1000); TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000); @@ -260,6 +279,14 @@ TEST_F( threads, double_reduce) { TestReduce< double , Kokkos::Threads >( 1000000 ); } +TEST_F( threads , reducers ) +{ + TestReducers<int, Kokkos::Threads>::execute_integer(); + TestReducers<size_t, Kokkos::Threads>::execute_integer(); + TestReducers<double, Kokkos::Threads>::execute_float(); + TestReducers<Kokkos::complex<double>, Kokkos::Threads>::execute_basic(); +} + TEST_F( threads, team_long_reduce) { TestReduceTeam< long , Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 3 ); TestReduceTeam< long , Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 3 ); @@ -291,13 +318,17 @@ TEST_F( threads, team_shared_request) { TestSharedTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >(); } -#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) +#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) TEST_F( threads, team_lambda_shared_request) { TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >(); TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >(); } #endif +TEST_F( threads, shmem_size) { + TestShmemSize< Kokkos::Threads >(); +} + TEST_F( threads , view_remap ) { enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 }; @@ -382,6 +413,75 @@ TEST_F( threads , atomics ) ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<3>, Kokkos::Threads>(loop_count,3) ) ); } +TEST_F( threads , atomic_operations ) +{ + 
const int start = 1; //Avoid zero for division + const int end = 11; + for (int i = start; i < end; ++i) + { + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned 
int,Kokkos::Threads>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long 
int,Kokkos::Threads>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 4 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 5 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 6 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 7 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 8 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 9 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( 
TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 4 ) ) ); + + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 1 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 2 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 3 ) ) ); + ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 4 ) ) ); + } + +} + //---------------------------------------------------------------------------- #if 0 @@ -434,7 +534,9 @@ TEST_F( threads , memory_pool ) bool val = TestMemoryPool::test_mempool< Kokkos::Threads >( 128, 128000000 ); ASSERT_TRUE( val ); - TestMemoryPool::test_mempool2< Kokkos::Threads >( 128, 128000000 ); + TestMemoryPool::test_mempool2< Kokkos::Threads >( 64, 4, 1000000, 2000000 ); + + TestMemoryPool::test_memory_exhaustion< Kokkos::Threads >(); } //---------------------------------------------------------------------------- @@ -478,6 +580,8 @@ TEST_F( threads , team_vector ) ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(10) ) ); } +#if defined( KOKKOS_ENABLE_TASKPOLICY ) + TEST_F( threads , task_policy ) { TestTaskPolicy::test_task_dep< Kokkos::Threads >( 10 ); @@ -503,6 +607,8 @@ TEST_F( threads , task_latch ) TestTaskPolicy::test_latch< Kokkos::Threads >(1000); } +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ + } // namespace Test #endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */ diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp index 60c87472df..ae4c6d2185 100644 --- a/lib/kokkos/core/unit_test/TestViewAPI.hpp +++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp @@ -63,7 +63,9 @@ size_t 
allocation_count( const Kokkos::View<T,P...> & view ) const size_t card = view.size(); const size_t alloc = view.span(); - return card <= alloc ? alloc : 0 ; + const int memory_span = Kokkos::View<int*>::required_allocation_size(100); + + return (card <= alloc && memory_span == 400) ? alloc : 0 ; } #else diff --git a/lib/kokkos/example/fenl/CGSolve.hpp b/lib/kokkos/example/fenl/CGSolve.hpp index 370dee15ac..06a0030e09 100644 --- a/lib/kokkos/example/fenl/CGSolve.hpp +++ b/lib/kokkos/example/fenl/CGSolve.hpp @@ -245,8 +245,8 @@ void cgsolve( const ImportType & import norm_res = sqrt( old_rdot ); iteration = 0 ; - Kokkos::Impl::Timer wall_clock ; - Kokkos::Impl::Timer timer; + Kokkos::Timer wall_clock ; + Kokkos::Timer timer; while ( tolerance < norm_res && iteration < maximum_iteration ) { diff --git a/lib/kokkos/example/fenl/fenl_functors.hpp b/lib/kokkos/example/fenl/fenl_functors.hpp index 30f5274a51..3020c99a2f 100644 --- a/lib/kokkos/example/fenl/fenl_functors.hpp +++ b/lib/kokkos/example/fenl/fenl_functors.hpp @@ -138,7 +138,7 @@ public: //-------------------------------- // Guess at capacity required for the map: - Kokkos::Impl::Timer wall_clock ; + Kokkos::Timer wall_clock ; wall_clock.reset(); phase = FILL_NODE_SET ; diff --git a/lib/kokkos/example/fenl/fenl_impl.hpp b/lib/kokkos/example/fenl/fenl_impl.hpp index 9c57da2989..64070ce55f 100644 --- a/lib/kokkos/example/fenl/fenl_impl.hpp +++ b/lib/kokkos/example/fenl/fenl_impl.hpp @@ -312,7 +312,7 @@ Perf fenl( //------------------------------------ - Kokkos::Impl::Timer wall_clock ; + Kokkos::Timer wall_clock ; Perf perf_stats = Perf() ; diff --git a/lib/kokkos/example/global_2_local_ids/G2L.hpp b/lib/kokkos/example/global_2_local_ids/G2L.hpp index d4198c61ac..9023ae0426 100644 --- a/lib/kokkos/example/global_2_local_ids/G2L.hpp +++ b/lib/kokkos/example/global_2_local_ids/G2L.hpp @@ -186,7 +186,7 @@ size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned nu typedef 
Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view; double elasped_time = 0; - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; local_id_view local_2_global("local_ids", num_ids); global_id_view global_2_local(capacity); diff --git a/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp b/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp index 837c74038c..ca819e4f97 100644 --- a/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp +++ b/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp @@ -76,7 +76,7 @@ namespace Tacho { int r_val = 0; - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; double t_import = 0.0, t_reorder = 0.0, diff --git a/lib/kokkos/example/md_skeleton/main.cpp b/lib/kokkos/example/md_skeleton/main.cpp index 06287bc609..58cf76cab0 100644 --- a/lib/kokkos/example/md_skeleton/main.cpp +++ b/lib/kokkos/example/md_skeleton/main.cpp @@ -76,7 +76,7 @@ int main(int argc, char** argv) { int iter = 100; /* Default value for system size (4*nx*ny*nz atoms) - * nx, ny and nz are set to system_size if not specififed on commandline */ + * nx, ny and nz are set to system_size if not specified on commandline */ int system_size = 20; int nx = -1; @@ -191,7 +191,7 @@ int main(int argc, char** argv) { printf("-> Running %i force calculations\n",iter); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; for(int i=0;i<iter;i++) { force(system,0); diff --git a/lib/kokkos/example/multi_fem/Explicit.hpp b/lib/kokkos/example/multi_fem/Explicit.hpp index ddeb53ae61..cef1a37a1a 100644 --- a/lib/kokkos/example/multi_fem/Explicit.hpp +++ b/lib/kokkos/example/multi_fem/Explicit.hpp @@ -127,7 +127,7 @@ PerformanceData run( const typename FixtureType::FEMeshType & mesh , PerformanceData perf_data ; - Kokkos::Impl::Timer wall_clock ; + Kokkos::Timer wall_clock ; //------------------------------------ // Generate fields diff --git a/lib/kokkos/example/multi_fem/Implicit.hpp 
b/lib/kokkos/example/multi_fem/Implicit.hpp index 0017cb8e88..53f602f11a 100644 --- a/lib/kokkos/example/multi_fem/Implicit.hpp +++ b/lib/kokkos/example/multi_fem/Implicit.hpp @@ -154,7 +154,7 @@ PerformanceData run( const typename FixtureType::FEMeshType & mesh , typename graph_factory::element_map_type element_map ; - Kokkos::Impl::Timer wall_clock ; + Kokkos::Timer wall_clock ; //------------------------------------ // Generate sparse matrix graph and element->graph map. diff --git a/lib/kokkos/example/multi_fem/Nonlinear.hpp b/lib/kokkos/example/multi_fem/Nonlinear.hpp index 96a05b97a9..1d243395c2 100644 --- a/lib/kokkos/example/multi_fem/Nonlinear.hpp +++ b/lib/kokkos/example/multi_fem/Nonlinear.hpp @@ -243,7 +243,7 @@ PerformanceData run( const typename FixtureType::FEMeshType & mesh , //------------------------------------ // Generate mesh and corresponding sparse matrix graph - Kokkos::Impl::Timer wall_clock ; + Kokkos::Timer wall_clock ; //------------------------------------ // Generate sparse matrix graph and element->graph map. 
diff --git a/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp b/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp index 6ab42da50c..8d140b6d25 100644 --- a/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp +++ b/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp @@ -243,7 +243,7 @@ void cgsolve( normr = sqrt( old_rdot ); iteration = 0 ; - Kokkos::Impl::Timer wall_clock ; + Kokkos::Timer wall_clock ; while ( tolerance < normr && iteration < maximum_iteration ) { diff --git a/lib/kokkos/example/sort_array/CMakeLists.txt b/lib/kokkos/example/sort_array/CMakeLists.txt index 3e58198d7b..0c7da74f4a 100644 --- a/lib/kokkos/example/sort_array/CMakeLists.txt +++ b/lib/kokkos/example/sort_array/CMakeLists.txt @@ -1,4 +1,3 @@ -INCLUDE(TribitsAddExecutableAndTest) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/lib/kokkos/example/sort_array/sort_array.hpp b/lib/kokkos/example/sort_array/sort_array.hpp index 018b1ee8e8..d21f998958 100644 --- a/lib/kokkos/example/sort_array/sort_array.hpp +++ b/lib/kokkos/example/sort_array/sort_array.hpp @@ -116,7 +116,7 @@ void sort_array( const size_t array_length /* length of spans of array to sort * #endif - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; const device_array_type work_array("work_array" , array_length ); const host_array_type host_array("host_array" , total_length ); diff --git a/lib/kokkos/example/tutorial/01_hello_world/Makefile b/lib/kokkos/example/tutorial/01_hello_world/Makefile index 38fb1b8f86..78a9fed0cc 100644 --- a/lib/kokkos/example/tutorial/01_hello_world/Makefile +++ b/lib/kokkos/example/tutorial/01_hello_world/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile b/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile 
index bd2371382a..95ee2c47fe 100644 --- a/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile +++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/02_simple_reduce/Makefile b/lib/kokkos/example/tutorial/02_simple_reduce/Makefile index 38fb1b8f86..78a9fed0cc 100644 --- a/lib/kokkos/example/tutorial/02_simple_reduce/Makefile +++ b/lib/kokkos/example/tutorial/02_simple_reduce/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile index bd2371382a..95ee2c47fe 100644 --- a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile +++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/03_simple_view/Makefile b/lib/kokkos/example/tutorial/03_simple_view/Makefile index 38fb1b8f86..78a9fed0cc 100644 --- a/lib/kokkos/example/tutorial/03_simple_view/Makefile +++ b/lib/kokkos/example/tutorial/03_simple_view/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile b/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile index bd2371382a..95ee2c47fe 100644 --- 
a/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile +++ b/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile b/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile index 38fb1b8f86..78a9fed0cc 100644 --- a/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile +++ b/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/05_simple_atomics/Makefile b/lib/kokkos/example/tutorial/05_simple_atomics/Makefile index 38fb1b8f86..78a9fed0cc 100644 --- a/lib/kokkos/example/tutorial/05_simple_atomics/Makefile +++ b/lib/kokkos/example/tutorial/05_simple_atomics/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp index 
e61e8af59b..8406c504c9 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp +++ b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp @@ -142,7 +142,7 @@ int main (int narg, char* arg[]) { // Measure time to execute the contraction kernel when giving it a // LayoutLeft view for v1 and a LayoutRight view for v2. This should be // fast on GPUs and slow on CPUs - Kokkos::Impl::Timer time1; + Kokkos::Timer time1; Kokkos::parallel_for(size,contraction<left_type,right_type>(a,l,r)); Kokkos::fence(); double sec1 = time1.seconds(); @@ -154,7 +154,7 @@ int main (int narg, char* arg[]) { // Measure time to execute the contraction kernel when giving it a // LayoutRight view for v1 and a LayoutLeft view for v2. This should be // fast on CPUs and slow on GPUs - Kokkos::Impl::Timer time2; + Kokkos::Timer time2; Kokkos::parallel_for(size,contraction<right_type,left_type>(a,r,l)); Kokkos::fence(); double sec2 = time2.seconds(); diff --git a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp index 8317c78bc9..ddd28a97c3 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp +++ b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp @@ -124,12 +124,12 @@ int main(int narg, char* arg[]) { // Run the localsum functor using the 
RandomAccess trait. On CPUs there should // not be any different in performance to not using the RandomAccess trait. // On GPUs where can be a dramatic difference - Kokkos::Impl::Timer time1; + Kokkos::Timer time1; Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src)); Kokkos::fence(); double sec1 = time1.seconds(); - Kokkos::Impl::Timer time2; + Kokkos::Timer time2; Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src)); Kokkos::fence(); double sec2 = time2.seconds(); diff --git a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp index 62ddb9c18a..4905e4bf88 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp +++ b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp @@ -87,9 +87,9 @@ struct localsum { // For example, the const_data_type version of double** is const // 
double**. Kokkos::View<idx_type::const_data_type, idx_type::array_layout, memory_space> idx; - // "array_intrinsic_type" is a typedef in ViewTraits (and DualView) which is the + // "scalar_array_type" is a typedef in ViewTraits (and DualView) which is the // array version of the value(s) stored in the View. - Kokkos::View<view_type::array_intrinsic_type, view_type::array_layout, memory_space> dest; + Kokkos::View<view_type::scalar_array_type, view_type::array_layout, memory_space> dest; Kokkos::View<view_type::const_data_type, view_type::array_layout, memory_space, Kokkos::MemoryRandomAccess> src; @@ -150,6 +150,9 @@ protected: int main (int narg, char* arg[]) { Kokkos::initialize (narg, arg); +// If View is non-trivial constructible type then add braces so it is out of scope +// before Kokkos::finalize() call +{ ParticleTypes test("Test"); Kokkos::fence(); test.h_view(0) = ParticleType(-1e4,1); @@ -182,7 +185,7 @@ int main (int narg, char* arg[]) { // Run on the device. This will cause a sync of idx to the device, // since it was marked as modified on the host. 
- Kokkos::Impl::Timer timer; + Kokkos::Timer timer; Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src)); Kokkos::fence(); double sec1_dev = timer.seconds(); @@ -208,6 +211,7 @@ int main (int narg, char* arg[]) { printf("Device Time with Sync: %f without Sync: %f \n",sec1_dev,sec2_dev); printf("Host Time with Sync: %f without Sync: %f \n",sec1_host,sec2_host); +} Kokkos::finalize(); } diff --git a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp index a7d460a1cc..cf5326b687 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp +++ b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp @@ -97,7 +97,7 @@ int main(int narg, char* arg[]) { Kokkos::fence(); // Run on the device // This will cause a sync of idx to the device since it was modified on the host - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src)); Kokkos::fence(); double sec1_dev = timer.seconds(); diff --git a/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile @@ -5,7 +5,7 @@ 
default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile index 06955b3641..60a514f4d5 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 --default-stream per-thread LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp index 8c7e26c850..5da3bf76c9 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp +++ b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp @@ -116,7 +116,7 @@ int main(int argc, char * argv[]) { Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),FillDevice(0.0,d_a)); Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),FillDevice(1.3513,d_b)); Kokkos::fence(); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),ComputeADevice(20,d_a,d_b)); if(synch==1) diff --git a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile +++ b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq 
(,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp index 52816333c2..3e6175a756 100644 --- a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp +++ b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp @@ -122,7 +122,7 @@ int main(int argc, char* args[]) { Kokkos::DualView<uint64_t*> vals("Vals",size*samples); // Run some performance comparisons - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples)); Kokkos::fence(); diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile index 3d056537c3..965b72b4e9 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile @@ -5,13 +5,14 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} 
LINKFLAGS = EXE = $(SRC:.cpp=.cuda) KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" +KOKKOS_CUDA_OPTIONS = "enable_lambda" else CXX = g++ CXXFLAGS = -O3 diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp index 5d081bf62b..565dd22e82 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp @@ -62,7 +62,8 @@ int main (int narg, char* args[]) { // Set up a policy that launches 12 teams, with the maximum number // of threads per team. - const team_policy policy (12, team_policy::team_size_max ( [=]{} )); + + const team_policy policy (12, Kokkos::AUTO); // This is a reduction with a team policy. The team policy changes // the first argument of the lambda. Rather than an integer index diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile +++ 
b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp index 0eac4309a9..99d5958edf 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp @@ -141,11 +141,11 @@ int main(int narg, char* args[]) { // Each team handles a slice of the data // Set up TeamPolicy with 512 teams with maximum number of threads per team and 16 vector lanes. - // The team_size_max function will determine the maximum number of threads taking into account - // shared memory requirements of the Functor. + // Kokkos::AUTO will determine the number of threads // The maximum vector length is hardware dependent but can always be smaller than the hardware allows. // The vector length must be a power of 2. 
- const Kokkos::TeamPolicy<> policy( 512 , Kokkos::TeamPolicy<>::team_size_max(SomeCorrelation(data,gsum)) , 16); + + const Kokkos::TeamPolicy<> policy( 512 , Kokkos::AUTO , 16); Kokkos::parallel_for( policy , SomeCorrelation(data,gsum) ); diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile index 3d056537c3..12ad36b31e 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile @@ -5,7 +5,7 @@ default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) -CXX = nvcc_wrapper +CXX = ../../../../config/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} LINKFLAGS = diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp index a9b20da1ae..c12b11d04d 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp @@ -117,7 +117,7 @@ int main(int narg, char* args[]) { Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; // threads/team is automatically limited to maximum supported by the device. 
Kokkos::parallel_for( team_policy( nchunks , TEAM_SIZE ) , find_2_tuples(chunk_size,data,histogram) ); diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash index 016e5dfe35..86f136da96 100755 --- a/lib/kokkos/generate_makefile.bash +++ b/lib/kokkos/generate_makefile.bash @@ -107,7 +107,7 @@ case $key in exit 0 ;; *) - # unknown option + echo "warning: ignoring unknown option $key" ;; esac shift diff --git a/src/MANYBODY/pair_vashishta.cpp b/src/MANYBODY/pair_vashishta.cpp index aa030540f4..19f6017907 100755 --- a/src/MANYBODY/pair_vashishta.cpp +++ b/src/MANYBODY/pair_vashishta.cpp @@ -541,7 +541,7 @@ void PairVashishta::setup_params() /* ---------------------------------------------------------------------- */ void PairVashishta::twobody(Param *param, double rsq, double &fforce, - int eflag, double &eng) + int eflag, double &eng) { double r,rinvsq,r4inv,r6inv,reta,lam1r,lam4r,vc2,vc3; diff --git a/src/MANYBODY/pair_vashishta.h b/src/MANYBODY/pair_vashishta.h index cdd2da3471..87077011e6 100755 --- a/src/MANYBODY/pair_vashishta.h +++ b/src/MANYBODY/pair_vashishta.h @@ -17,8 +17,8 @@ PairStyle(vashishta,PairVashishta) #else -#ifndef LMP_PAIR_Vashishta_H -#define LMP_PAIR_Vashishta_H +#ifndef LMP_PAIR_VASHISHITA_H +#define LMP_PAIR_VASHISHITA_H #include "pair.h" @@ -29,10 +29,10 @@ class PairVashishta : public Pair { PairVashishta(class LAMMPS *); virtual ~PairVashishta(); virtual void compute(int, int); - void settings(int, char **); + virtual void settings(int, char **); void coeff(int, char **); - virtual double init_one(int, int); - virtual void init_style(); + double init_one(int, int); + void init_style(); protected: struct Param { @@ -55,9 +55,9 @@ class PairVashishta : public Pair { int maxparam; // max # of parameter sets Param *params; // parameter set for an I-J-K interaction - virtual void allocate(); + void allocate(); void read_file(char *); - void setup_params(); + virtual void setup_params(); void twobody(Param *, double, 
double &, int, double &); void threebody(Param *, Param *, Param *, double, double, double *, double *, double *, double *, int, double &); diff --git a/src/fix_nve_sphere.cpp b/src/fix_nve_sphere.cpp index 9f7b4a9eaa..42cca31f6b 100644 --- a/src/fix_nve_sphere.cpp +++ b/src/fix_nve_sphere.cpp @@ -133,12 +133,13 @@ void FixNVESphere::initial_integrate(int vflag) // update mu for dipoles - if (extra == DIPOLE) { double **mu = atom->mu; if (dlm == NODLM) { + // d_mu/dt = omega cross mu // renormalize mu to dipole length + for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) if (mu[i][3] > 0.0) { @@ -152,7 +153,9 @@ void FixNVESphere::initial_integrate(int vflag) mu[i][2] = g[2]*scale; } } else { - // Integrate orientation following Dullweber-Leimkuhler-Maclachlan scheme + + // integrate orientation following Dullweber-Leimkuhler-Maclachlan scheme + for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit && mu[i][3] > 0.0) { @@ -160,8 +163,9 @@ void FixNVESphere::initial_integrate(int vflag) // Q is the rotation matrix from space frame to body frame // i.e. v_b = Q.v_s - // Define mu to lie along the z axis in the body frame - // We take the unit dipole to avoid getting a scaling matrix + // define mu to lie along the z axis in the body frame + // take the unit dipole to avoid getting a scaling matrix + inv_len_mu = 1.0/mu[i][3]; a[0] = mu[i][0]*inv_len_mu; a[1] = mu[i][1]*inv_len_mu; @@ -180,9 +184,15 @@ void FixNVESphere::initial_integrate(int vflag) if (s2 != 0.0){ // i.e. 
the vectors are not parallel scale = (1.0 - a[2])/s2; - Q[0][0] = 1.0 - scale*a[0]*a[0]; Q[0][1] = -scale*a[0]*a[1]; Q[0][2] = -a[0]; - Q[1][0] = -scale*a[0]*a[1]; Q[1][1] = 1.0 - scale*a[1]*a[1]; Q[1][2] = -a[1]; - Q[2][0] = a[0]; Q[2][1] = a[1]; Q[2][2] = 1.0 - scale*(a[0]*a[0] + a[1]*a[1]); + Q[0][0] = 1.0 - scale*a[0]*a[0]; + Q[0][1] = -scale*a[0]*a[1]; + Q[0][2] = -a[0]; + Q[1][0] = -scale*a[0]*a[1]; + Q[1][1] = 1.0 - scale*a[1]*a[1]; + Q[1][2] = -a[1]; + Q[2][0] = a[0]; + Q[2][1] = a[1]; + Q[2][2] = 1.0 - scale*(a[0]*a[0] + a[1]*a[1]); } else { // if parallel then we just have I or -I Q[0][0] = 1.0/a[2]; Q[0][1] = 0.0; Q[0][2] = 0.0; Q[1][0] = 0.0; Q[1][1] = 1.0/a[2]; Q[1][2] = 0.0; @@ -242,7 +252,9 @@ void FixNVESphere::initial_integrate(int vflag) // Transform w back into space frame w_temp = Q^T.w transpose_matvec(Q_temp,w,w_temp); - omega[i][0] = w_temp[0]; omega[i][1] = w_temp[1]; omega[i][2] = w_temp[2]; + omega[i][0] = w_temp[0]; + omega[i][1] = w_temp[1]; + omega[i][2] = w_temp[2]; // Set dipole according to updated Q: mu = Q^T.[0 0 1] * |mu| mu[i][0] = Q_temp[2][0] * mu[i][3]; @@ -289,7 +301,8 @@ void FixNVESphere::final_integrate() omega[i][0] += dtirotate * torque[i][0]; omega[i][1] += dtirotate * torque[i][1]; omega[i][2] += dtirotate * torque[i][2]; - rke += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] + omega[i][2]*omega[i][2])*radius[i]*radius[i]*rmass[i]; + rke += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] + + omega[i][2]*omega[i][2])*radius[i]*radius[i]*rmass[i]; } } diff --git a/src/group.cpp b/src/group.cpp index da0e94fc11..973fcbdcce 100644 --- a/src/group.cpp +++ b/src/group.cpp @@ -1665,46 +1665,93 @@ void Group::inertia(int igroup, double *cm, double itensor[3][3], int iregion) /* ---------------------------------------------------------------------- compute angular velocity omega from L and I - really not a group/region operation, but L,I were computed for a group/region - diagonalize I instead of inverting it, to allow 
for a singular matrix ------------------------------------------------------------------------- */ void Group::omega(double *angmom, double inertia[3][3], double *w) { double idiag[3],ex[3],ey[3],ez[3],cross[3]; - double evectors[3][3]; - - int ierror = MathExtra::jacobi(inertia,idiag,evectors); - if (ierror) error->all(FLERR, - "Insufficient Jacobi rotations for group::omega"); - - ex[0] = evectors[0][0]; - ex[1] = evectors[1][0]; - ex[2] = evectors[2][0]; - ey[0] = evectors[0][1]; - ey[1] = evectors[1][1]; - ey[2] = evectors[2][1]; - ez[0] = evectors[0][2]; - ez[1] = evectors[1][2]; - ez[2] = evectors[2][2]; - - // enforce 3 evectors as a right-handed coordinate system - // flip 3rd vector if needed - - MathExtra::cross3(ex,ey,cross); - if (MathExtra::dot3(cross,ez) < 0.0) MathExtra::negate3(ez); - - // if any principal moment < scaled EPSILON, set to 0.0 + double evectors[3][3],inverse[3][3]; + + // determinant = triple product of rows of inertia matrix + + double determinant = inertia[0][0]*inertia[1][1]*inertia[2][2] + + inertia[0][1]*inertia[1][2]*inertia[2][0] + + inertia[0][2]*inertia[1][0]*inertia[2][1] - + inertia[0][0]*inertia[1][2]*inertia[2][1] - + inertia[0][1]*inertia[1][0]*inertia[2][2] - + inertia[2][0]*inertia[1][1]*inertia[0][2]; + + // non-singular I matrix + // use L = Iw, inverting I to solve for w + // this should give exact zeroing of angular momentum by velocity command + + if (determinant > EPSILON) { + + inverse[0][0] = inertia[1][1]*inertia[2][2] - inertia[1][2]*inertia[2][1]; + inverse[0][1] = -(inertia[0][1]*inertia[2][2] - + inertia[0][2]*inertia[2][1]); + inverse[0][2] = inertia[0][1]*inertia[1][2] - inertia[0][2]*inertia[1][1]; + + inverse[1][0] = -(inertia[1][0]*inertia[2][2] - + inertia[1][2]*inertia[2][0]); + inverse[1][1] = inertia[0][0]*inertia[2][2] - inertia[0][2]*inertia[2][0]; + inverse[1][2] = -(inertia[0][0]*inertia[1][2] - + inertia[0][2]*inertia[1][0]); + + inverse[2][0] = inertia[1][0]*inertia[2][1] - 
inertia[1][1]*inertia[2][0]; + inverse[2][1] = -(inertia[0][0]*inertia[2][1] - + inertia[0][1]*inertia[2][0]); + inverse[2][2] = inertia[0][0]*inertia[1][1] - inertia[0][1]*inertia[1][0]; + + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) + inverse[i][j] /= determinant; + + w[0] = inverse[0][0]*angmom[0] + inverse[0][1]*angmom[1] + + inverse[0][2]*angmom[2]; + w[1] = inverse[1][0]*angmom[0] + inverse[1][1]*angmom[1] + + inverse[1][2]*angmom[2]; + w[2] = inverse[2][0]*angmom[0] + inverse[2][1]*angmom[1] + + inverse[2][2]*angmom[2]; + + // handle (nearly) singular I matrix + // typically due to 2-atom group or linear molecule + // use jacobi() and angmom_to_omega() to calculate valid omega + // less exact answer than matrix inversion, due to iterative Jacobi method + + } else { + int ierror = MathExtra::jacobi(inertia,idiag,evectors); + if (ierror) error->all(FLERR, + "Insufficient Jacobi rotations for group::omega"); + + ex[0] = evectors[0][0]; + ex[1] = evectors[1][0]; + ex[2] = evectors[2][0]; + ey[0] = evectors[0][1]; + ey[1] = evectors[1][1]; + ey[2] = evectors[2][1]; + ez[0] = evectors[0][2]; + ez[1] = evectors[1][2]; + ez[2] = evectors[2][2]; - double max; - max = MAX(idiag[0],idiag[1]); - max = MAX(max,idiag[2]); + // enforce 3 evectors as a right-handed coordinate system + // flip 3rd vector if needed - if (idiag[0] < EPSILON*max) idiag[0] = 0.0; - if (idiag[1] < EPSILON*max) idiag[1] = 0.0; - if (idiag[2] < EPSILON*max) idiag[2] = 0.0; + MathExtra::cross3(ex,ey,cross); + if (MathExtra::dot3(cross,ez) < 0.0) MathExtra::negate3(ez); - // calculate omega using diagonalized inertia matrix + // if any principal moment < scaled EPSILON, set to 0.0 - MathExtra::angmom_to_omega(angmom,ex,ey,ez,idiag,w); + double max; + max = MAX(idiag[0],idiag[1]); + max = MAX(max,idiag[2]); + + if (idiag[0] < EPSILON*max) idiag[0] = 0.0; + if (idiag[1] < EPSILON*max) idiag[1] = 0.0; + if (idiag[2] < EPSILON*max) idiag[2] = 0.0; + + // calculate omega using diagonalized 
inertia matrix + + MathExtra::angmom_to_omega(angmom,ex,ey,ez,idiag,w); + } } -- GitLab