diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index acb54ff22fb5383ab5a243c805fe90e56c9129f5..3fe9e46111b9f858d84e4579c8b026ae37dda472 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,46 @@ # Change Log +## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13) + +**Implemented enhancements:** + +- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406) +- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630) +- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898) +- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904) +- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737) +- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890) +- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843) +- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842) +- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870) +- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824) +- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853) +- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852) +- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771) +- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros 
[\#716](https://github.com/kokkos/kokkos/issues/716) +- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668) +- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566) +- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214) + +**Fixed bugs:** + +- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975) +- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941) +- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940) +- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939) +- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917) +- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863) +- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862) +- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860) +- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829) +- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826) +- Kokkos CUDA failes to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776) +- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767) +- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758) +- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670) +- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560) + ## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) 
(2017-05-27) [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 24cd772e008c6dff867317203c0f7e0f57b7256d..d2967cf9a3fe51d5335e50969f31368e71cbad07 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -33,6 +33,7 @@ KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "lib KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l)) # Check for advanced settings. +KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "compiler_warnings" | wc -l)) KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l)) KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l)) KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_dualview_modify_check" | wc -l)) @@ -78,14 +79,14 @@ KOKKOS_INTERNAL_COMPILER_PGI := $(strip $(shell $(CXX) --version 2 KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)) KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)) KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(CXX) --version 2>&1 | grep nvcc | wc -l)) -KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l)) -KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l)) ifneq ($(OMPI_CXX),) - KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l)) + KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep nvcc | wc -l)) endif ifneq ($(MPICH_CXX),) - KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l)) 
+ KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep nvcc | wc -l)) endif +KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l)) +KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l)) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2) KOKKOS_INTERNAL_COMPILER_CLANG = 1 @@ -111,6 +112,36 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) endif endif +# Set compiler warnings flags. +ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + # TODO check if PGI accepts GNU style warnings + KOKKOS_INTERNAL_COMPILER_WARNINGS = + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + # TODO check if cray accepts GNU style warnings + KOKKOS_INTERNAL_COMPILER_WARNINGS = + else + #gcc + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized + endif + endif + endif + endif + endif +else + KOKKOS_INTERNAL_COMPILER_WARNINGS = +endif + # Set OpenMP flags. ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) KOKKOS_INTERNAL_OPENMP_FLAG := -mp @@ -162,6 +193,7 @@ endif # Intel based. 
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_WSM := $(strip $(shell echo $(KOKKOS_ARCH) | grep WSM | wc -l)) KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l)) KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l)) KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l)) @@ -229,13 +261,14 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l)) # Any AVX? +KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) # Decide what ISA level we are able to support. 
-KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc )) KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc )) @@ -243,7 +276,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) # Incompatible flags? -KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc )) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc )) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc)) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -257,12 +290,10 @@ endif KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src -# No warnings: KOKKOS_CXXFLAGS = -# INTEL and CLANG warnings: -#KOKKOS_CXXFLAGS = -Wall -Wshadow 
-pedantic -Wsign-compare -Wtype-limits -Wuninitialized -# GCC warnings: -#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered +ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS) +endif KOKKOS_LIBS = -lkokkos -ldl KOKKOS_LDFLAGS = -L$(shell pwd) @@ -486,6 +517,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) + tmp := $(shell echo "\#define KOKKOS_ARCH_SSE42 1" >> KokkosCore_config.tmp ) + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -xSSE4.2 + KOKKOS_LDFLAGS += -xSSE4.2 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += -tp=nehalem + KOKKOS_LDFLAGS += -tp=nehalem + else + # Assume that this is a really a GNU compiler. + KOKKOS_CXXFLAGS += -msse4.2 + KOKKOS_LDFLAGS += -msse4.2 + endif + endif + endif +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp ) @@ -689,7 +742,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif -KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h) +KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h) KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l)) else diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 3cb52a04cd3093faf9a495c19de5e22652bd79f4..a9341a907c15009eb270c1b44bb4074f8f1f8cb5 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -20,8 +20,10 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: 
$(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp -Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp +Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp +Kokkos_Rendezvous.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp @@ -36,6 +38,8 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp +Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) diff --git a/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp index 1e7ee68549a2aa439d3c86935409bc3ba6528eba..c2c118ce1a140e69f0c87ddd7c0fc46870b5f58c 
100644 --- a/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp @@ -61,14 +61,19 @@ protected: { std::cout << std::setprecision(5) << std::scientific; - unsigned threads_count = omp_get_max_threads(); + int threads_count = 0; + #pragma omp parallel + { + #pragma omp atomic + ++threads_count; + } - if ( Kokkos::hwloc::available() ) { - threads_count = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); + if (threads_count > 3) { + threads_count /= 2; } Kokkos::OpenMP::initialize( threads_count ); + Kokkos::OpenMP::print_configuration( std::cout ); } static void TearDownTestCase() diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index 9cf02f74b4980ee2845e6c055392099f883d3b5b..2771f1793d76371afbab136b9d71641e93131a2b 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -35,7 +35,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER @@ -283,12 +283,12 @@ struct test_random_scalar { RandomGenerator& pool, unsigned int num_draws) { - using std::cerr; + using std::cout; using std::endl; using Kokkos::parallel_reduce; { - cerr << " -- Testing randomness properties" << endl; + cout << " -- Testing randomness properties" << endl; RandomProperties result; typedef test_random_functor<RandomGenerator, Scalar> functor_type; @@ -307,7 +307,7 @@ struct test_random_scalar { ( 1.5*tolerance > variance_eps)) ? 1:0; pass_covar = ((-2.0*tolerance < covariance_eps) && ( 2.0*tolerance > covariance_eps)) ? 1:0; - cerr << "Pass: " << pass_mean + cout << "Pass: " << pass_mean << " " << pass_var << " " << mean_eps << " " << variance_eps @@ -315,7 +315,7 @@ struct test_random_scalar { << " || " << tolerance << endl; } { - cerr << " -- Testing 1-D histogram" << endl; + cout << " -- Testing 1-D histogram" << endl; RandomProperties result; typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type; @@ -335,7 +335,7 @@ struct test_random_scalar { pass_hist1d_covar = ((-0.06 < covariance_eps) && ( 0.06 > covariance_eps)) ? 1:0; - cerr << "Density 1D: " << mean_eps + cout << "Density 1D: " << mean_eps << " " << variance_eps << " " << (result.covariance/HIST_DIM1D/HIST_DIM1D) << " || " << tolerance @@ -348,7 +348,7 @@ struct test_random_scalar { << endl; } { - cerr << " -- Testing 3-D histogram" << endl; + cout << " -- Testing 3-D histogram" << endl; RandomProperties result; typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type; @@ -368,7 +368,7 @@ struct test_random_scalar { pass_hist3d_covar = ((-tolerance < covariance_eps) && ( tolerance > covariance_eps)) ? 
1:0; - cerr << "Density 3D: " << mean_eps + cout << "Density 3D: " << mean_eps << " " << variance_eps << " " << result.covariance/HIST_DIM1D/HIST_DIM1D << " || " << tolerance @@ -381,18 +381,18 @@ struct test_random_scalar { template <class RandomGenerator> void test_random(unsigned int num_draws) { - using std::cerr; + using std::cout; using std::endl; typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d"); typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d"); uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count(); - cerr << "Test Seed:" << ticks << endl; + cout << "Test Seed:" << ticks << endl; RandomGenerator pool(ticks); - cerr << "Test Scalar=int" << endl; + cout << "Test Scalar=int" << endl; test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_int.pass_mean,1); ASSERT_EQ( test_int.pass_var,1); @@ -406,7 +406,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=unsigned int" << endl; + cout << "Test Scalar=unsigned int" << endl; test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_uint.pass_mean,1); ASSERT_EQ( test_uint.pass_var,1); @@ -420,7 +420,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=int64_t" << endl; + cout << "Test Scalar=int64_t" << endl; test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_int64.pass_mean,1); ASSERT_EQ( test_int64.pass_var,1); @@ -434,7 +434,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=uint64_t" << endl; + cout << "Test Scalar=uint64_t" << endl; test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_uint64.pass_mean,1); 
ASSERT_EQ( test_uint64.pass_var,1); @@ -448,7 +448,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=float" << endl; + cout << "Test Scalar=float" << endl; test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_float.pass_mean,1); ASSERT_EQ( test_float.pass_var,1); @@ -462,7 +462,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=double" << endl; + cout << "Test Scalar=double" << endl; test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_double.pass_mean,1); ASSERT_EQ( test_double.pass_var,1); diff --git a/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp index f952ab3db51028aff0a0ebfe313b2639e353ab87..9e75b580bc0f64b53c402764197e11d1774203d8 100644 --- a/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp +++ b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ diff --git a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp index f545247212ab6057baca8bfb39463daa760747db..8db5ce0eb5e869c565ce8f3872baea5200e4beb4 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp @@ -44,12 +44,13 @@ #include<Kokkos_Core.hpp> #include<impl/Kokkos_Timer.hpp> #include<bench.hpp> +#include<cstdlib> int main(int argc, char* argv[]) { Kokkos::initialize(); - - if(argc<10) { + + if(argc<10) { printf("Arguments: N K R D U F T S\n"); printf(" P: Precision (1==float, 2==double)\n"); printf(" N,K: dimensions of the 2D array to allocate\n"); @@ -68,7 +69,7 @@ int main(int argc, char* argv[]) { Kokkos::finalize(); return 0; } - + int P = atoi(argv[1]); int N = atoi(argv[2]); @@ -80,7 +81,7 @@ int main(int argc, char* argv[]) { int T = atoi(argv[8]); int S = atoi(argv[9]); - if(U>8) {printf("U must be 1-8\n"); return 0;} + if(U>8) {printf("U must be 1-8\n"); return 0;} if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;} if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;} diff --git a/lib/kokkos/benchmarks/gather/main.cpp b/lib/kokkos/benchmarks/gather/main.cpp index 161c6f20919639845adecd96d74d978c65ea952f..88eb0493c1861595069ae518a7fab628a37ce150 100644 --- a/lib/kokkos/benchmarks/gather/main.cpp +++ b/lib/kokkos/benchmarks/gather/main.cpp @@ -44,11 +44,11 @@ #include<Kokkos_Core.hpp> #include<impl/Kokkos_Timer.hpp> #include<gather.hpp> +#include<cstdlib> int main(int argc, char* argv[]) { Kokkos::initialize(argc,argv); - if(argc<8) { printf("Arguments: S N K D\n"); printf(" S: Scalar Type Size (1==float, 2==double, 4=complex<double>)\n"); diff --git a/lib/kokkos/benchmarks/policy_performance/Makefile 
b/lib/kokkos/benchmarks/policy_performance/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..13aef3209cace8419138d946a919eb893ed9a8d2 --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/Makefile @@ -0,0 +1,46 @@ +KOKKOS_PATH = ../.. +SRC = $(wildcard *.cpp) + +# default, build and clean name commands, not files; declare them phony so a stray file with one of these names cannot mask the rule. +.PHONY: default build clean +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 -g +LINK = ${CXX} +LINKFLAGS = +EXE = policy_performance.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +KOKKOS_CUDA_OPTIONS+=enable_lambda +else +CXX = g++ +CXXFLAGS = -O3 -g -Wall -Werror +LINK = ${CXX} +LINKFLAGS = +EXE = policy_performance.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +DEPFLAGS = -M + +OBJ = $(SRC:.cpp=.o) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< diff --git a/lib/kokkos/benchmarks/policy_performance/main.cpp b/lib/kokkos/benchmarks/policy_performance/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b0ed9bb512f599a60eaa235bb80de1f1df33c755 --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/main.cpp @@ -0,0 +1,170 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include "policy_perf_test.hpp" + +int main(int argc, char* argv[] ) { + Kokkos::initialize(argc,argv); + + if(argc<11) { // program name + 10 required arguments: argv[10] (test_type) is read below + printf(" Ten arguments are needed to run this program:\n"); + printf(" (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, (5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, (9)schedule, (10)test_type\n"); + printf(" team_range: number of teams (league_size)\n"); + printf(" thread_range: range for nested TeamThreadRange parallel_*\n"); + printf(" vector_range: range for nested ThreadVectorRange parallel_*\n"); + printf(" outer_repeat: number of repeats for outer parallel_* call\n"); + printf(" thread_repeat: number of repeats for TeamThreadRange parallel_* call\n"); + printf(" vector_repeat: number of repeats for ThreadVectorRange parallel_* call\n"); + printf(" team_size: number of team members (team_size)\n"); + printf(" vector_size: desired vectorization (if possible)\n"); + printf(" schedule: 1 == Static 2 == Dynamic\n"); + printf(" test_type: 3-digit code XYZ for testing (nested) parallel_*\n"); + printf(" code key: XYZ X in {1,2,3,4,5}, Y in {0,1,2}, Z in {0,1,2}\n"); + printf(" TeamPolicy:\n"); + printf(" X: 0 = none (never used, makes no sense); 1 = parallel_for; 2 = parallel_reduce\n"); + printf(" Y: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n"); + printf(" Z: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n"); + printf(" RangePolicy:\n"); + printf(" X: 3 = parallel_for; 4 = parallel_reduce; 5 = parallel_scan\n"); + printf(" Y: 0 = none\n"); + printf(" Z: 0 = none\n"); + printf(" Example Input:\n"); + printf(" 100000 32 32 100 100 100 8 1 1 100\n"); + Kokkos::finalize(); + return 0; + } + + int team_range = atoi(argv[1]); + int thread_range = atoi(argv[2]); + int vector_range = atoi(argv[3]); + + int outer_repeat = atoi(argv[4]); + int 
thread_repeat = atoi(argv[5]); + int vector_repeat = atoi(argv[6]); + + int team_size = atoi(argv[7]); + int vector_size = atoi(argv[8]); + int schedule = atoi(argv[9]); + int test_type = atoi(argv[10]); + + int disable_verbose_output = 0; + if ( argc > 11 ) { + disable_verbose_output = atoi(argv[11]); + } + + if ( schedule != 1 && schedule != 2 ) { + printf("schedule: %d\n", schedule); + printf("Options for schedule are: 1 == Static 2 == Dynamic\n"); + Kokkos::finalize(); + return -1; + } + + if ( test_type != 100 && test_type != 110 && test_type != 111 && test_type != 112 && test_type != 120 && test_type != 121 && test_type != 122 + && test_type != 200 && test_type != 210 && test_type != 211 && test_type != 212 && test_type != 220 && test_type != 221 && test_type != 222 + && test_type != 300 && test_type != 400 && test_type != 500 + ) + { + printf("Incorrect test_type option\n"); + Kokkos::finalize(); + return -2; + } + + double result = 0.0; + + Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10,1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, double& lval) { + lval += 1; + }, result); + + typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type_1d; + typedef Kokkos::View<double**, Kokkos::LayoutRight> view_type_2d; + typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d; + + // Allocate view without initializing + // Call a 'warmup' test with 1 repeat - this will initialize the corresponding view appropriately for test and should obey first-touch etc + // Second call to test is the one we actually care about and time + view_type_1d v_1( Kokkos::ViewAllocateWithoutInitializing("v_1"), team_range*team_size); + view_type_2d v_2( Kokkos::ViewAllocateWithoutInitializing("v_2"), team_range*team_size, thread_range); + view_type_3d v_3( Kokkos::ViewAllocateWithoutInitializing("v_3"), team_range*team_size, thread_range, vector_range); + + double result_computed = 0.0; + double result_expect = 0.0; + double time = 
0.0; + + if(schedule==1) { + if ( test_type != 500 ) { + // warmup - no repeat of loops + test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + else { + // parallel_scan: initialize 1d view for parallel_scan + test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + } + if(schedule==2) { + if ( test_type != 500 ) { + // warmup - no repeat of loops + test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + else { + // parallel_scan: initialize 1d view for parallel_scan; NOTE(review): this branch instantiates Schedule<Static> although schedule==2 (Dynamic) was requested - confirm this is intended + test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + } + + if ( disable_verbose_output == 0 ) { + printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i %3i %e %e 
%lf\n",team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,schedule,test_type,result_computed,result_expect,time); + } + else { + printf("%lf\n",time); + } + + Kokkos::finalize(); + + return 0; +} diff --git a/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8c79f3b88dabfb8ba420d6f4e2b890e4a10cc304 --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp @@ -0,0 +1,354 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +template < class ViewType > +struct ParallelScanFunctor { + using value_type = double; + ViewType v; + + ParallelScanFunctor( const ViewType & v_ ) + : v(v_) + {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int idx, value_type& val, const bool& final ) const + { + // inclusive scan + val += v(idx); + if ( final ) { + v(idx) = val; + } + } +}; + +template<class ScheduleType,class IndexType,class ViewType1, class ViewType2, class ViewType3> +void test_policy(int team_range, int thread_range, int vector_range, + int outer_repeat, int thread_repeat, int inner_repeat, + int team_size, int vector_size, int test_type, + ViewType1 &v1, ViewType2 &v2, ViewType3 &v3, + double &result, double &result_expect, double &time) { + + typedef Kokkos::TeamPolicy<ScheduleType,IndexType> t_policy; + typedef typename t_policy::member_type t_team; + Kokkos::Timer timer; + + for(int orep = 0; orep<outer_repeat; orep++) { + + if (test_type == 100) { + Kokkos::parallel_for("100 outer for", t_policy(team_range,team_size), + KOKKOS_LAMBDA (const t_team& team) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + v1(idx) = idx; + // prevent compiler optimizing loop away + }); + } + + if (test_type == 110) { + Kokkos::parallel_for("110 outer 
for", t_policy(team_range,team_size), + KOKKOS_LAMBDA (const t_team& team) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + for (int tr = 0; tr<thread_repeat; ++tr) { + // Each team launches a parallel_for; thread_range is partitioned among team members + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) { + v2( idx, t ) = t; + // prevent compiler optimizing loop away + }); + } + }); + } + if (test_type == 111) { + Kokkos::parallel_for("111 outer for", t_policy(team_range,team_size,vector_size), + KOKKOS_LAMBDA (const t_team& team) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + for (int tr = 0; tr<thread_repeat; ++tr) { + // Each team launches a parallel_for; thread_range is partitioned among team members + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) { + for (int vr = 0; vr<inner_repeat; ++vr) + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) { + v3( idx, t, vi ) = vi; + // prevent compiler optimizing loop away + }); + }); + } + }); + } + if (test_type == 112) { + Kokkos::parallel_for("112 outer for", t_policy(team_range,team_size,vector_size), + KOKKOS_LAMBDA (const t_team& team) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + for (int tr = 0; tr<thread_repeat; ++tr) { + // Each team launches a parallel_for; thread_range is partitioned among team members + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) { + double vector_result = 0.0; + for (int vr = 0; vr<inner_repeat; ++vr) { + vector_result = 0.0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) { + vval += 1; + }, vector_result); + } + v2( idx, t ) = vector_result; + // prevent compiler optimizing loop away + }); + } + }); + } + if (test_type == 120) { + Kokkos::parallel_for("120 outer for", t_policy(team_range,team_size), + KOKKOS_LAMBDA 
(const t_team& team) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + double team_result = 0.0; + for (int tr = 0; tr<thread_repeat; ++tr) { + team_result = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) { + lval += 1; + }, team_result); + } + v1(idx) = team_result; + // prevent compiler optimizing loop away + }); + } + if (test_type == 121) { + Kokkos::parallel_for("121 outer for", t_policy(team_range,team_size,vector_size), + KOKKOS_LAMBDA (const t_team& team) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + double team_result = 0.0; + for (int tr = 0; tr<thread_repeat; ++tr) { + team_result = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) { + lval += 1; + for (int vr = 0; vr<inner_repeat; ++vr) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) { + v3( idx, t, vi ) = vi; + // prevent compiler optimizing loop away + }); + } + }, team_result); + } + v3( idx, 0, 0 ) = team_result; + // prevent compiler optimizing loop away + }); + } + if (test_type == 122) { + Kokkos::parallel_for("122 outer for", t_policy(team_range,team_size,vector_size), + KOKKOS_LAMBDA (const t_team& team) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + double team_result = 0.0; + for (int tr = 0; tr<thread_repeat; ++tr) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) { + double vector_result = 0.0; + // braces are required so the vector-level reduce itself is repeated inner_repeat times + for (int vr = 0; vr<inner_repeat; ++vr) { + vector_result = 0.0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) { + vval += 1; + }, vector_result); + } + // accumulate once per t, after the repeats, so the reduced value does not scale with inner_repeat + lval += vector_result; + }, team_result); + } + v1(idx) = team_result; + // prevent compiler optimizing loop away + }); + } + if (test_type == 200) { + Kokkos::parallel_reduce("200 outer reduce",
t_policy(team_range,team_size), + KOKKOS_LAMBDA (const t_team& team, double& lval) { + lval+=team.team_size()*team.league_rank() + team.team_rank(); + },result); + result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1); + // sum ( seq( [0, team_range*team_size) ) + } + if (test_type == 210) { + Kokkos::parallel_reduce("210 outer reduce", t_policy(team_range,team_size), + KOKKOS_LAMBDA (const t_team& team, double& lval) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + double thread_for = 1.0; + for(int tr = 0; tr<thread_repeat; tr++) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) { + v2(idx,t) = t; + // prevent compiler optimizing loop away + }); + } + lval+=(team.team_size()*team.league_rank() + team.team_rank() + thread_for); + },result); + result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1) + (team_range*team_size); + // sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) ) + } + if (test_type == 211) { + Kokkos::parallel_reduce("211 outer reduce", t_policy(team_range,team_size,vector_size), + KOKKOS_LAMBDA (const t_team& team, double& lval) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + double thread_for = 1.0; + for(int tr = 0; tr<thread_repeat; tr++) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) { + for (int vr = 0; vr<inner_repeat; ++vr) + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) { + v3(idx, t, vi) = vi; + // prevent compiler optimizing loop away + }); + }); + } + lval+=idx+thread_for; + },result); + result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (team_range*team_size); + // sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) ) + } + if (test_type == 212) { + Kokkos::parallel_reduce("212 outer reduce", t_policy(team_range,team_size,vector_size), + 
KOKKOS_LAMBDA (const t_team& team, double& lval) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + double vector_result = 0.0; + for(int tr = 0; tr<thread_repeat; tr++) { + // This parallel_for is executed by each team; the thread_range is partitioned among the team members + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) { + v2(idx,t) = t; + // prevent compiler optimizing loop away + for (int vr = 0; vr<inner_repeat; ++vr) { + vector_result = 0.0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double &vval) { + vval += vi; + }, vector_result ); + } + }); + } + lval+= idx + vector_result; + },result); + result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (0.5*vector_range*(vector_range-1)*team_range*team_size); + // sum ( seq( [0, team_range*team_size) + sum( seq( [0, vector_range) ) per team_member (total of team_range*team_size) ) + } + if (test_type == 220) { + Kokkos::parallel_reduce("220 outer reduce", t_policy(team_range,team_size), + KOKKOS_LAMBDA (const t_team& team, double& lval) { + double team_result = 0.0; + for(int tr = 0; tr<thread_repeat; tr++) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) { + tval += t; + },team_result); + } + lval+=team_result*team.league_rank(); // constant * league_rank + },result); + result_expect = 0.5*(team_range)*(team_range-1) * team_size * 0.5*(thread_range)*(thread_range-1); + // sum ( seq( [0, team_range) * constant ); constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team) + } + if (test_type == 221) { + Kokkos::parallel_reduce("221 outer reduce", t_policy(team_range,team_size,vector_size), + KOKKOS_LAMBDA (const t_team& team, double& lval) { + long idx = team.league_rank()*team.team_size() + team.team_rank(); + double team_result = 0; + for(int tr = 0; tr<thread_repeat; tr++) { + 
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) { + double vector_for = 1.0; + for (int vr = 0; vr<inner_repeat; ++vr) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) { + v3(idx, t, vi) = vi; + // prevent compiler optimizing loop away + }); + } + tval += t + vector_for; + },team_result); + } + lval+=team_result*team.league_rank(); + },result); + result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range); + // sum ( seq( [0, team_range) * constant ) + 1 per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team) + } + if (test_type == 222) { + Kokkos::parallel_reduce("222 outer reduce", t_policy(team_range,team_size,vector_size), + KOKKOS_LAMBDA (const t_team& team, double& lval) { + double team_result = 0.0; + for(int tr = 0; tr<thread_repeat; tr++) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) { + double vector_result = 0.0; + for (int vr = 0; vr<inner_repeat; ++vr) { + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double& vval) { + vval += vi; + }, vector_result); + } + tval += t + vector_result; + },team_result); + } + lval+=team_result*team.league_rank(); + },result); + result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range*0.5*(vector_range)*(vector_range-1)); + // sum ( seq( [0, team_range) * constant ) + 1 + sum( seq([0,vector_range) ) per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team) + } + + // parallel_for RangePolicy: range = team_size*team_range + if (test_type == 300) { + Kokkos::parallel_for("300 outer for", team_size*team_range, + KOKKOS_LAMBDA (const int idx) { + v1(idx) = idx; + // prevent compiler from optimizing away the loop + 
}); + } + // parallel_reduce RangePolicy: range = team_size*team_range + if (test_type == 400) { + Kokkos::parallel_reduce("400 outer reduce", team_size*team_range, + KOKKOS_LAMBDA (const int idx, double& val) { + val += idx; + }, result); + result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1); + } + // parallel_scan RangePolicy: range = team_size*team_range + if (test_type == 500) { + Kokkos::parallel_scan("500 outer scan", team_size*team_range, + ParallelScanFunctor<ViewType1>(v1) +#if 0 + // This does not compile with pre Cuda 8.0 - see Github Issue #913 for explanation + KOKKOS_LAMBDA (const int idx, double& val, const bool& final) { + // inclusive scan + val += v1(idx); + if ( final ) { + v1(idx) = val; + } + } +#endif + ); + // result = v1( team_size*team_range - 1 ); // won't work with Cuda - need to copy result back to host to print + // result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1); + } + + } // end outer for loop + + time = timer.seconds(); +} //end test_policy diff --git a/lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh b/lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh new file mode 100755 index 0000000000000000000000000000000000000000..e621fffbd435bcbdedfc3244250a0330f83bb928 --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Script to check policy_perf_test code works with each possible combo of options + +echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies" + +EXECUTABLE=policy_performance + +TEAMRANGE=1000 +THREADRANGE=4 +VECTORRANGE=32 +TEAMSIZE=4 +VECTORSIZE=1 +OREPEAT=1 +MREPEAT=1 +IREPEAT=1 +SCHEDULE=1 + +SUFFIX=host +if [ -e $EXECUTABLE.$SUFFIX ] +then +SCHEDULE=1 +echo "Host tests Static schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE 
$THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done + +SCHEDULE=2 +echo "Host tests Dynamic schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done +fi + +SUFFIX=cuda +if [ -e $EXECUTABLE.$SUFFIX ] +then +SCHEDULE=1 +echo "Cuda tests Static schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done + +SCHEDULE=2 +echo "Cuda tests Dynamic schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done +fi diff --git a/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh b/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh new file mode 100755 index 0000000000000000000000000000000000000000..f4bfb87f8fed1d89a03289754281313aa1e83eeb --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +# Sample script for benchmarking policy performance + +# Suggested environment variables to export prior to executing script: +# KNL: +# OMP_NUM_THREADS=256 KMP_AFFINITY=compact +# Power: +# OMP_NUM_THREADS=64 OMP_PROC_BIND=true + +# Constants and Variables: +# Vary: TEAMSIZE, and THREADRANGE +# for TEAMSIZE in {1,2,4,5,8}; do +# for THREADRANGE in {32,41,1000}; do +# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE +# System specific: Adjust REPEAT values to architecture tests are run on + +# Tests +# Static SCHEDULE = 1 +# Tier 1: parallel_for + RangePolicy 300 +# Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500 +# Tier 3: 'outer'
parallel_for with TeamPolicy (nested parallelism) 1XY +# Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY +# Dynamic SCHEDULE = 2 +# Tier 5: parallel_for + RangePolicy 300 +# Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500 +# Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY +# Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY + +# Results grouped by: +# 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE + +EXECUTABLE=policy_performance + +# Default defined values +TEAMRANGE=1000 +THREADRANGE=1 +VECTORRANGE=32 +TEAMSIZE=1 +VECTORSIZE=1 +OREPEAT=1 +MREPEAT=1 +IREPEAT=1 +SCHEDULE=1 + +# Host tests +SUFFIX=host +if [ -e $EXECUTABLE.$SUFFIX ]; then +echo "Host" + +for SCHEDULE in {1,2}; do + +# Tier 1 and 2, 5 and 6 +for CODE in {300,400,500}; do + for TEAMSIZE in {1,2,4,5,8}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done +done + +# Tier 3, 7 +for CODE in {100,110,111,112,120,121,122}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +# Tier 4, 8 +for CODE in {200,210,211,212,220,221,222}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +done # end SCHEDULE + +fi # end host + + +# Cuda tests +SUFFIX=cuda +# TEAMRANGE=10000, TEAMSIZE=8 too large +# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large +if [ -e $EXECUTABLE.$SUFFIX ]; then +echo "Cuda" + +for SCHEDULE in {1,2}; do + +# Reset defaults +TEAMRANGE=1000 +THREADRANGE=1 +VECTORRANGE=32 +TEAMSIZE=1 +VECTORSIZE=1 + +# Tier 1 and 
2, 5 and 6 +for CODE in {300,400,500}; do + for TEAMSIZE in {1,2,4,5,8}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done +done + +# Tier 3, 7 +for CODE in {100,110,111,112,120,121,122}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +# Tier 4, 8 +for CODE in {200,210,211,212,220,221,222}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +done # end SCHEDULE + +fi #end cuda diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind new file mode 100755 index 0000000000000000000000000000000000000000..ca34648780174d626bc2b04dbbbb282eda3f9dff --- /dev/null +++ b/lib/kokkos/bin/hpcbind @@ -0,0 +1,454 @@ +#!/usr/bin/env bash + +################################################################################ +# Check if hwloc commands exist +################################################################################ +declare -i HPCBIND_HAS_HWLOC=1 +type hwloc-bind >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-distrib >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-ls >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-calc >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-ps >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! 
$?)) + +if [[ ${HPCBIND_HAS_HWLOC} -eq 0 ]]; then + echo "hwloc not found, no process binding will occur" +fi + +# Get parent cpuset +HPCBIND_HWLOC_PARENT_CPUSET="" +if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + MY_PID="$BASHPID" + HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) +fi + +################################################################################ +# Check if nvidia-smi exist +################################################################################ +declare -i HPCBIND_HAS_NVIDIA=0 +type nvidia-smi >/dev/null 2>&1 +HPCBIND_HAS_NVIDIA=$((!$?)) + + +################################################################################ +# Get visible gpu +################################################################################ +declare -i NUM_GPUS=0 +HPCBIND_VISIBLE_GPUS="" +if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then + NUM_GPUS=$(nvidia-smi -L | wc -l); + GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )" + HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}} +fi + +declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) + + +################################################################################ +# Get queue id +# supports sbatch, bsub, aprun +################################################################################ +HPCBIND_QUEUE_NAME="" +declare -i HPCBIND_QUEUE_INDEX=0 +declare -i HPCBIND_QUEUE_GPU_MAPPING=0 + +if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then + HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_NAME="sbatch" + HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID} +elif [[ ! -z "${LBS_JOBINDEX}" ]]; then + HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_NAME="bsub" + HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX} +elif [[ ! 
-z "${ALPS_APP_PE}" ]]; then + HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_NAME="aprun" + HPCBIND_QUEUE_INDEX=${ALPS_APP_PE} +fi + + +################################################################################ +# Show help +################################################################################ +function show_help { + local cmd=$(basename "$0") + echo "Usage: ${cmd} <options> -- command ..." + echo " Set the process mask, OMP environment variables and CUDA environment" + echo " variables to sane values if possible. Uses hwloc and nvidia-smi if" + echo " available. Will preserve the current process binding, so it is safe" + echo " to use with a queuing system or mpiexec." + echo "" + echo "Options:" + echo " --no-hwloc-bind Disable binding" + echo " --proc-bind=<LOC> Set the initial process mask for the script" + echo " LOC can be any valid location argument for" + echo " hwloc-calc Default: all" + echo " --distribute=N Distribute the current cpuset into N partitions" + echo " --distribute-partition=I" + echo " Use the i'th partition (zero based)" + echo " --visible-gpus=<L> Comma separated list of gpu ids" + echo " Default: CUDA_VISIBLE_DEVICES or all gpus in" + echo " sequential order" + echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU" + echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES" + echo " --openmp=M.m Set env variables for the given OpenMP version" + echo " Default: 4.0" + echo " --openmp-percent=N Integer percentage of cpuset to use for OpenMP" + echo " threads Default: 100" + echo " --openmp-places=<Op> Op=threads|cores|sockets. 
Default: threads" + echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" + echo " --force-openmp-num-threads=N" + echo " Override logic for selecting OMP_NUM_THREADS" + echo " --force-openmp-proc-bind=<OP>" + echo " Override logic for selecting OMP_PROC_BIND" + echo " --no-openmp-nested Set OMP_NESTED to false" + echo " --show-bindings Show the bindings" + echo " --lstopo Show bindings in lstopo without executing a command" + echo " -v|--verbose Show options and relevant environment variables" + echo " -h|--help Show this message" + echo "" + # Worked examples: each description line is followed by the matching command line + echo "Sample Usage:" + echo " Split the current process cpuset into 4 and use the 3rd partition" + echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..." + echo " Bind the process to all even cores" + echo " ${cmd} --proc-bind=core:even -v -- command ..." + echo " Bind to the first 64 cores and split the current process cpuset into 4" + echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..." + echo " skip GPU 0 when mapping visible devices" + echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
+ echo " Display the current bindings" + echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command" + echo " Display the current bindings using lstopo" + echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo" + echo "" +} + + +################################################################################ +# Parse command line arguments +################################################################################ +# Show help if no command line arguments given +if [[ "$#" -eq 0 ]]; then + show_help + exit 0 +fi + +declare -a UNKNOWN_ARGS=() +declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC} +declare -i HPCBIND_DISTRIBUTE=1 +declare -i HPCBIND_PARTITION=0 +HPCBIND_PROC_BIND="all" +HPCBIND_OPENMP_VERSION=4.0 +declare -i HPCBIND_OPENMP_PERCENT=100 +HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads} +declare -i HPCBIND_OPENMP_PROC_BIND=1 +declare -i HPCBIND_OPENMP_FORCE_NUM_THREADS=-1 +HPCBIND_OPENMP_FORCE_PROC_BIND="" +HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true} +declare -i HPCBIND_VERBOSE=0 + +declare -i HPCBIND_SHOW_BINDINGS=0 +declare -i HPCBIND_LSTOPO=0 + +# Iterate over a quoted "$@" so an option containing whitespace or glob characters +# stays a single word; every handled option is shifted off so that only the +# command to run remains in the positional parameters afterwards. +for i in "$@"; do + case "$i" in + # disable hwloc binding + --no-hwloc-bind) + HPCBIND_ENABLE_HWLOC_BIND=0 + shift + ;; + --proc-bind=*) + HPCBIND_PROC_BIND="${i#*=}" + shift + ;; + # number of partitions to create + --distribute=*) + HPCBIND_DISTRIBUTE="${i#*=}" + shift + ;; + # which partition to use + --distribute-partition=*) + HPCBIND_PARTITION="${i#*=}" + shift + ;; + --visible-gpus=*) + HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ') + shift + ;; + --gpu-ignore-queue) + HPCBIND_QUEUE_GPU_MAPPING=0 + shift + ;; + --no-gpu-mapping) + HPCBIND_ENABLE_GPU_MAPPING=0 + shift + ;; + --openmp=*) + HPCBIND_OPENMP_VERSION="${i#*=}" + shift + ;; + --openmp-percent=*) + HPCBIND_OPENMP_PERCENT="${i#*=}" + shift + ;; + --openmp-places=*) + HPCBIND_OPENMP_PLACES="${i#*=}" + shift + ;; + --no-openmp-proc-bind) + HPCBIND_OPENMP_PROC_BIND=0 + shift + ;; + --force-openmp-proc-bind=*) + HPCBIND_OPENMP_FORCE_PROC_BIND="${i#*=}" + shift + ;; +
--force-openmp-num-threads=*) + HPCBIND_OPENMP_FORCE_NUM_THREADS="${i#*=}" + shift + ;; + --no-openmp-nested) + HPCBIND_OPENMP_NESTED="false" + shift + ;; + --show-bindings) + HPCBIND_VERBOSE=1 + HPCBIND_SHOW_BINDINGS=1 + shift + ;; + --lstopo) + HPCBIND_VERBOSE=1 + HPCBIND_SHOW_BINDINGS=0 + HPCBIND_LSTOPO=1 + shift + ;; + -v|--verbose) + HPCBIND_VERBOSE=1 + shift + ;; + -h|--help) + show_help + exit 0 + ;; + # ignore remaining arguments + --) + shift + break + ;; + # unknown option + *) + UNKNOWN_ARGS+=("$i") + shift + ;; + esac +done + + +################################################################################ +# Check unknown arguments +################################################################################ +# -gt compares numerically; inside [[ ]] the '>' operator is a lexicographic string comparison +if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then + echo "Unknown options: ${UNKNOWN_ARGS[*]}" + exit 1 +fi + + +################################################################################ +# Check that visible gpus are valid +################################################################################ +# Re-split the space separated id list into an array; out-of-range ids fall back to GPU 0 +HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS}) +if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then + for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do + if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} || + ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then + echo "Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0" + HPCBIND_VISIBLE_GPUS[$i]=0; + fi + done + NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]} +fi + + +################################################################################ +# Check OpenMP percent +################################################################################ +if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then + echo "OpenMP percent < 1, setting to 1" + HPCBIND_OPENMP_PERCENT=1 +elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then + echo "OpenMP percent > 100, setting to 100" + HPCBIND_OPENMP_PERCENT=100 +fi + +################################################################################ +# Check distribute
+################################################################################ +if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then + echo "Invalid input for distribute, changing distribute to 1" + HPCBIND_DISTRIBUTE=1 +fi + +if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then + echo "Invalid input for distribute-partition, changing to 0" + HPCBIND_PARTITION=0 +fi + + +################################################################################ +# Find cpuset and num threads +################################################################################ +HPCBIND_HWLOC_CPUSET="" +declare -i HPCBIND_NUM_PUS=0 + +if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then + BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND}) + else + BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND}) + fi + + CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) + HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]} + HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l) +else + HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor) +fi + +declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_PERCENT)) +HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_OPENMP_NUM_THREADS / 100)) + + +if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then + HPCBIND_OPENMP_NUM_THREADS=1 +elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then + HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS} +fi + +if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} -gt 0 ]]; then + HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS} +fi + +################################################################################ +# Set OpenMP environment variables +################################################################################ + +# set OMP_NUM_THREADS +export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS} + +# set OMP_PROC_BIND and OMP_PLACES +if [[ 
${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then + if [[ "${HPCBIND_OPENMP_FORCE_PROC_BIND}" == "" ]]; then + #default proc bind logic + if [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then + export OMP_PLACES="${HPCBIND_OPENMP_PLACES}" + export OMP_PROC_BIND="spread" + else + export OMP_PROC_BIND="true" + unset OMP_PLACES + fi + else + #force proc bind + export OMP_PLACES="${HPCBIND_OPENMP_PLACES}" + export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}" + fi +else + # no openmp proc bind + unset OMP_PLACES + unset OMP_PROC_BIND +fi + +# set OMP_NESTED +export OMP_NESTED=${HPCBIND_OPENMP_NESTED} + + +################################################################################ +# Set CUDA environment variables +################################################################################ + +if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then + if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then + declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) + export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + else + declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) + declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) + export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + fi +fi + +################################################################################ +# Set hpcbind environment variables +################################################################################ +export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} +export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} +export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} +export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET} +export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE} +export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION} +if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then + export HPCBIND_HWLOC_PARENT_CPUSET="all" +else + export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET} +fi +export 
HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND} +export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} +export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') +export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION} +if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then + export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX} + export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME} + export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING} +fi + + +################################################################################ +# Print verbose +################################################################################ + +if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then + MY_ENV=$(env | sort) + echo "[HPCBIND]" + echo "${MY_ENV}" | grep -E "^HPCBIND_" + echo "[CUDA]" + echo "${MY_ENV}" | grep -E "^CUDA_" + echo "[OPENMP]" + echo "${MY_ENV}" | grep -E "^OMP_" +fi + +if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then + echo "[BINDINGS]" + hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu +elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then + echo "Unable to show bindings, hwloc not available." +fi + +################################################################################ +# Run command +################################################################################ + +if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@ + else + eval $@ + fi +else + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then + echo "[BINDINGS]" + hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu + hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0 + else + hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} + fi + else + echo "Unable to show bindings, hwloc not available." 
#!/usr/bin/env bash
#
# kokkos-bind: use hwloc to divide the node into groups, select one group,
# export the matching OpenMP environment and run the given command bound to
# that group.  When any hwloc command is missing no binding is done and the
# command simply runs with the OpenMP environment set for the whole node.

# Count the hwloc commands that are missing; 0 means hwloc is usable.
declare -i HAS_HWLOC=0
for HWLOC_TOOL in hwloc-bind hwloc-distrib hwloc-ls hwloc-calc hwloc-ps; do
  type "${HWLOC_TOOL}" >/dev/null 2>&1 || HAS_HWLOC+=1
done
unset HWLOC_TOOL

# Defaults, overridden by the command line options parsed below.
declare -a UNKNOWN_ARGS=()
declare -i DISTRIBUTE=1
declare -i INDEX=0
PROC_BIND="all"
CURRENT_CPUSET=""
OPENMP_VERSION=4.0
OPENMP_PROC_BIND=True
OPENMP_NESTED=True
VERBOSE=False

# Get the cpuset this process is currently bound to (empty if hwloc-ps does
# not list it).  The pid is compared against the first column exactly so that
# pid 123 cannot pick up the entry of pid 1234 or a matching program name.
if [[ ${HAS_HWLOC} -eq 0 ]]; then
  MY_PID="$BASHPID"
  CURRENT_CPUSET=$(hwloc-ps --cpuset | awk -v pid="${MY_PID}" '$1 == pid { print $2; exit }')
  # NOTE(review): unconditional print kept from the original; looks like
  # leftover debug output -- confirm before removing.
  echo "$CURRENT_CPUSET"
fi

# Print the usage message.
function show_help {
  local cmd
  cmd=$(basename "$0")
  echo "Usage: ${cmd} <options> -- command ..."
  echo " Uses hwloc to divide the node into the given number of groups,"
  echo " set the appropriate OMP_NUM_THREADS and execute the command on the"
  echo " selected group."
  echo ""
  echo " NOTE: This command assumes it has exclusive use of the node"
  echo ""
  echo "Options:"
  echo " --proc-bind=<LOC> Set the initial process mask for the script. "
  echo " LOC can be any valid location argument for"
  echo " hwloc-calc. Defaults to the entire machine"
  echo " --distribute=N Distribute the current proc-bind into N groups"
  echo " --index=I Use the i'th group (zero based)"
  echo " --openmp=M.m Set env variables for the given OpenMP version"
  echo " (default 4.0)"
  echo " --no-openmp-proc-bind Unset OMP_PROC_BIND and OMP_PLACES"
  echo " --no-openmp-nested Set OMP_NESTED to false"
  echo " -v|--verbose"
  echo " -h|--help"
  echo ""
  echo "Sample Usage:"
  echo " ${cmd} --distribute=4 --index=2 -v -- command ..."
  echo ""
}

# Succeed when OPENMP_VERSION is at least 4.0.  The major number is compared
# numerically, so "10.0" and a bare "4" count as >= 4.0 (a plain string
# comparison sorts both before "4.0").  Values without a numeric major fall
# back to the original string comparison.
function openmp_supports_places {
  local major="${OPENMP_VERSION%%.*}"
  if [[ "${major}" =~ ^[0-9]+$ ]]; then
    # 10# forces base ten so a leading zero is not read as octal
    (( 10#${major} >= 4 ))
  else
    [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]
  fi
}

# Export OMP_PLACES, OMP_PROC_BIND, OMP_NESTED and OMP_NUM_THREADS from the
# parsed options and the already computed NUM_THREADS.
function set_openmp_env {
  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
    if openmp_supports_places; then
      export OMP_PLACES="threads"
      export OMP_PROC_BIND="spread"
    else
      export OMP_PROC_BIND="true"
      unset OMP_PLACES
    fi
  else
    unset OMP_PLACES
    unset OMP_PROC_BIND
  fi
  if [[ "${OPENMP_NESTED}" == "True" ]]; then
    export OMP_NESTED="true"
  else
    export OMP_NESTED="false"
  fi
  export OMP_NUM_THREADS="${NUM_THREADS}"
}

# Verbose lines shared by the hwloc and the no-hwloc paths.
function print_openmp_settings {
  echo "omp_num_threads: ${NUM_THREADS}"
  echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
  echo "omp_nested: ${OPENMP_NESTED}"
  echo "OpenMP: ${OPENMP_VERSION}"
}

if [[ "$#" -eq 0 ]]; then
  show_help
  exit 0
fi

# Parse options up to "--".  The loop walks a snapshot of the arguments while
# each handled option is shifted away, so that after the loop "$@" holds only
# the command to run.  "$@" is quoted so arguments containing whitespace are
# not split apart.
for i in "$@"; do
  case "$i" in
    # initial process mask
    --proc-bind=*)
      PROC_BIND="${i#*=}"
      shift
      ;;
    # number of partitions to create
    --distribute=*)
      DISTRIBUTE="${i#*=}"
      shift
      ;;
    # which group to use
    --index=*)
      INDEX="${i#*=}"
      shift
      ;;
    --openmp=*)
      OPENMP_VERSION="${i#*=}"
      shift
      ;;
    --no-openmp-proc-bind)
      OPENMP_PROC_BIND=False
      shift
      ;;
    --no-openmp-nested)
      OPENMP_NESTED=False
      shift
      ;;
    -v|--verbose)
      VERBOSE=True
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # ignore remaining arguments
    --)
      shift
      break
      ;;
    # unknown option
    *)
      UNKNOWN_ARGS+=("$i")
      shift
      ;;
  esac
done

if [[ ${#UNKNOWN_ARGS[@]} -gt 0 ]]; then
  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
  exit 1
fi

if [[ ${DISTRIBUTE} -le 0 ]]; then
  echo "Invalid input for distribute, changing distribute to 1"
  DISTRIBUTE=1
fi

# A negative index is rejected as well; it would otherwise index the cpuset
# array from its end.
if [[ ${INDEX} -lt 0 || ${INDEX} -ge ${DISTRIBUTE} ]]; then
  echo "Invalid input for index, changing index to 0"
  INDEX=0
fi

if [[ ${HAS_HWLOC} -ne 0 ]]; then
  echo "hwloc not found, no process binding will occur"
  DISTRIBUTE=1
  INDEX=0
fi

if [[ ${HAS_HWLOC} -eq 0 ]]; then

  # PROC_BIND and the cpusets are left unquoted as in the original.
  if [[ "${CURRENT_CPUSET}" == "" ]]; then
    BINDING=$(hwloc-calc ${PROC_BIND})
  else
    BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
  fi

  # Split the binding into DISTRIBUTE cpusets and take the selected one;
  # the thread count is the number of PUs hwloc-ls lists for it.
  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
  CPUSET=${CPUSETS[${INDEX}]}
  NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)

  if [[ "${VERBOSE}" == "True" ]]; then
    echo "hwloc: true"
    echo " proc_bind: ${PROC_BIND}"
    echo " distribute: ${DISTRIBUTE}"
    echo " index: ${INDEX}"
    echo " parent_cpuset: ${CURRENT_CPUSET}"
    echo " cpuset: ${CPUSET}"
    print_openmp_settings
  fi

  set_openmp_env

  hwloc-bind ${CPUSET} -- "$@"
else
  # Count only the per-CPU "processor" entries, not other lines that merely
  # contain the word.
  NUM_THREADS=$(grep -c '^processor' /proc/cpuinfo)

  if [[ "${VERBOSE}" == "True" ]]; then
    echo "hwloc: false"
    print_openmp_settings
  fi

  set_openmp_env

  # eval is kept so the command line is still re-parsed by the shell as
  # before.
  eval "$@"
fi
# Print the runtest usage followed by the help of generate_makefile.bash,
# whose options runtest forwards unchanged.
function show_help() {
  local cmd
  cmd=$(basename "$0")
  echo "Usage: ${cmd} <options> "
  echo " Build and run the tests"
  echo ""
  echo "Options:"
  echo " -j=N|--make-j=N Build the tests in parallel"
  echo " -c|--clean Clean build and regenerate make files"
  echo " --clean-on-pass Clean build when runtest passes"
  echo " --output-prefix=<pre> Prefix of log files Default: runtest"
  echo " --build-only Only build the tests"
  echo " -v|--verbose Tee STDOUT and STDERR to screen and files"
  echo " -h|--help Show this message"
  echo ""
  # quoted so a checkout path containing spaces still resolves
  "${KOKKOS_PATH}"/generate_makefile.bash --help
  return 0
}


# Option defaults.  Every argument runtest does not recognise is collected in
# GENERATE_ARGS and handed to generate_makefile.bash later on.
declare -a GENERATE_ARGS=()
declare -i VERBOSE=0
declare -i CLEAN=0
declare -i CLEAN_ON_PASS=0
declare -i BUILD_ONLY=0
OUTPUT="runtest"

# Parallel build width: HPCBIND_NUM_PUS when it is set in the environment,
# otherwise 1.
declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}

# "$@" is quoted so an argument containing whitespace, such as
# --cxxflags="-O3 -g", reaches GENERATE_ARGS as one element instead of being
# split into several.
for i in "$@"; do
  case "$i" in
    -j=*|--make-j=*)
      MAKE_J=${i#*=}
      shift
      ;;
    -c|--clean)
      CLEAN=1
      shift
      ;;
    --clean-on-pass)
      CLEAN_ON_PASS=1
      shift
      ;;
    --output-prefix=*)
      OUTPUT=${i#*=}
      shift
      ;;
    --build-only)
      BUILD_ONLY=1
      shift
      ;;
    -v|--verbose)
      VERBOSE=1
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # anything else is forwarded to generate_makefile.bash
    *)
      GENERATE_ARGS+=("$i")
      shift
      ;;
  esac
done
# Build stage: run "make build-test" with MAKE_J jobs.  stdout is appended to
# ${OUTPUT}.out (and also shown on screen when verbose); stderr is always
# shown and appended to ${OUTPUT}.err.
START=${SECONDS}
echo "Building"
if (( VERBOSE != 0 )); then
  make --keep-going -j ${MAKE_J} build-test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
else
  make --keep-going -j ${MAKE_J} build-test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
fi
# status of the make invocation that ran in the branch above
RESULT=$?
END=${SECONDS}

# On failure list the compiler error lines from both logs and stop.
if (( RESULT != 0 )); then
  for build_log in ${OUTPUT}.out ${OUTPUT}.err; do
    grep -E "[[:space:]]error:[[:space:]]" ${build_log}
  done
  echo " FAIL: $((END-START)) seconds"
  exit 1
fi

echo " PASS: $((END-START)) seconds"
if (( VERBOSE == 1 )); then
  echo ""
  echo ""
fi
+ END=${SECONDS} + if [[ ${RESULT} -eq 0 ]]; then + echo " PASS: $((END-START)) seconds" + if [[ ${CLEAN_ON_PASS} -eq 1 ]]; then + make clean + fi + else + cat ${OUTPUT}.out | grep "FAIL" + cat ${OUTPUT}.err | grep "FAIL" + echo " FAIL: $((END-START)) seconds" + exit 1 + fi +fi + +exit ${RESULT} + diff --git a/lib/kokkos/cmake/kokkos.cmake b/lib/kokkos/cmake/kokkos.cmake index 235b7eaba47f295aaa712cdd07bc8318a3731dbf..396822c7fa127d9bca063118061a9904c3c14d91 100644 --- a/lib/kokkos/cmake/kokkos.cmake +++ b/lib/kokkos/cmake/kokkos.cmake @@ -999,8 +999,12 @@ SET (Kokkos_INCLUDE_DIRS ${Kokkos_SOURCE_DIR}/containers/src ${Kokkos_SOURCE_DIR}/algorithms/src ${Kokkos_BINARY_DIR} # to find KokkosCore_config.h + ${KOKKOS_INCLUDE_DIRS} ) +# pass include dirs back to parent scope +SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE) + INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS}) IF(KOKKOS_SEPARATE_LIBS) diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt index cc6f4c97d74930de20e63dd39d7879bdfde728c6..0447db4b2b7bf7638de2ab89d082b4faa6ba2bfe 100644 --- a/lib/kokkos/config/master_history.txt +++ b/lib/kokkos/config/master_history.txt @@ -7,3 +7,4 @@ tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966 tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6 tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641 tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186 +tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a diff --git a/lib/kokkos/config/query_cuda_arch.cpp b/lib/kokkos/config/query_cuda_arch.cpp new file mode 100644 index 0000000000000000000000000000000000000000..383f04e34e08949142e33ede90a6e294642c1ba8 --- /dev/null +++ b/lib/kokkos/config/query_cuda_arch.cpp @@ -0,0 +1,24 @@ +#include <cstdio> +#include <cuda_runtime_api.h> +int main() +{ + cudaDeviceProp prop; + const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0); + if (cudaSuccess != err_code) { + 
fprintf(stderr,"cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err_code)); + return -1; + } + switch (prop.major) { + case 3: + printf("Kepler"); break; + case 5: + printf("Maxwell"); break; + case 6: + printf("Pascal"); break; + default: + fprintf(stderr, "Unspported Device %d%d\n", (int)prop.major, (int)prop.minor); + return -1; + } + printf("%d%d\n", (int)prop.major, (int)prop.minor); + return 0; +} diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia index 8e1246bf8bd85b36f0fae947a1ee280e820426e9..005cd2072132cf50e7d73fa92767a9df5956a0db 100755 --- a/lib/kokkos/config/test_all_sandia +++ b/lib/kokkos/config/test_all_sandia @@ -160,9 +160,14 @@ if [ "$MACHINE" = "sems" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" @@ -280,13 +285,13 @@ elif [ "$MACHINE" = "apollo" ]; then "gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" "clang/3.9.0 
$BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS" + "clang/4.0.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS" "cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) else # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" + "clang/4.0.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" "gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" @@ -584,7 +589,7 @@ single_build_and_test() { else run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local -i build_start_time=$(date +%s) - run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } + run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } local -i build_end_time=$(date +%s) comment="build_time=$(($build_end_time-$build_start_time))" diff --git a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel index 23968e8c0f8581866c6f7ed99ef3417ffc4c0442..6527df2eb9d9a8814ce996d7fe96d6e90f46eb40 100755 --- a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel +++ b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel @@ 
-28,14 +28,14 @@ export JENKINS_DO_PTHREAD=ON export JENKINS_DO_SERIAL=OFF export JENKINS_DO_COMPLEX=OFF -export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl" -export ARCH_C_FLAG="-xCORE-AVX2 -mkl" +export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl" +export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl" export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a" export LAPACK_LIBRARIES=${BLAS_LIBRARIES} export JENKINS_DO_TESTS=ON export JENKINS_DO_EXAMPLES=ON -export JENKINS_DO_SHARED=OFF +export JENKINS_DO_SHARED=ON export QUEUE=haswell diff --git a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel index 964de3a0026f7ccf7fdbc776e546d3787c856c53..1a306bc2b20bb3998d89391239f3d34249336213 100755 --- a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel +++ b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel @@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=OFF export JENKINS_DO_SERIAL=ON export JENKINS_DO_COMPLEX=ON -export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl" -export ARCH_C_FLAG="-xCORE-AVX2 -mkl" +export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl" +export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl" export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a" export LAPACK_LIBRARIES=${BLAS_LIBRARIES} export JENKINS_DO_TESTS=ON export JENKINS_DO_EXAMPLES=ON -export JENKINS_DO_SHARED=OFF +export JENKINS_DO_SHARED=ON export QUEUE=haswell diff --git a/lib/kokkos/containers/performance_tests/Makefile b/lib/kokkos/containers/performance_tests/Makefile index edaaf1ee51f8bbe5b41a1efb418a0fb83dd1de0b..ec69363a17deea398adda0e0b6c7b39732ddb4e1 100644 --- a/lib/kokkos/containers/performance_tests/Makefile +++ b/lib/kokkos/containers/performance_tests/Makefile @@ -60,7 
+60,6 @@ test-threads: KokkosContainers_PerformanceTest_Threads test-openmp: KokkosContainers_PerformanceTest_OpenMP ./KokkosContainers_PerformanceTest_OpenMP - build_all: $(TARGETS) test: $(TEST_TARGETS) diff --git a/lib/kokkos/containers/performance_tests/TestMain.cpp b/lib/kokkos/containers/performance_tests/TestMain.cpp index f952ab3db51028aff0a0ebfe313b2639e353ab87..1224af7cdb5484101dea69f155810caccf2258cb 100644 --- a/lib/kokkos/containers/performance_tests/TestMain.cpp +++ b/lib/kokkos/containers/performance_tests/TestMain.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,12 +36,15 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ #include <gtest/gtest.h> +#include <cstdlib> + +#include <Kokkos_Macros.hpp> int main(int argc, char *argv[]) { ::testing::InitGoogleTest(&argc,argv); diff --git a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp index b674ec4a7450b6a3ef0f4077f837ab8d51c92d9e..6631184624f4b8830951e3dca222dab925a19fc3 100644 --- a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp +++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp @@ -69,30 +69,13 @@ protected: { std::cout << std::setprecision(5) << std::scientific; - unsigned num_threads = 4; - - if (Kokkos::hwloc::available()) { - num_threads = Kokkos::hwloc::get_available_numa_count() - * Kokkos::hwloc::get_available_cores_per_numa() - * Kokkos::hwloc::get_available_threads_per_core() - ; - - } - - std::cout << "OpenMP: " << num_threads << std::endl; - - Kokkos::OpenMP::initialize( num_threads ); - - std::cout << "available threads: " << omp_get_max_threads() << std::endl; + Kokkos::OpenMP::initialize(); + Kokkos::OpenMP::print_configuration( std::cout ); } static void TearDownTestCase() { Kokkos::OpenMP::finalize(); - - omp_set_num_threads(1); - - ASSERT_EQ( 1 , omp_get_max_threads() ); } }; diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index 937eab0d889d70ae9b289a12a7083037601347d0..35cc8ec7534b8ad1bdc854f338826dabc0e9132a 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -564,7 +564,7 @@ namespace Impl { template< class D, class A1, class A2, class A3, class ... Args > struct DualViewSubview { - typedef typename Kokkos::Experimental::Impl::ViewMapping + typedef typename Kokkos::Impl::ViewMapping < void , Kokkos::ViewTraits< D, A1, A2, A3 > , Args ... 
diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 8e464506f9da6ed12278ed6435f48f63ab56e6aa..d22d6b865da99fa11a050e0bedfc5e088001a3ea 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -46,19 +46,6 @@ /// /// This header file declares and defines Kokkos::Experimental::DynRankView and its /// related nonmember functions. -/* - * Changes from View - * 1. The rank of the DynRankView is returned by the method rank() - * 2. Max rank of a DynRankView is 7 - * 3. subview name is subdynrankview - * 4. Every subdynrankview is returned with LayoutStride - * - * NEW: Redesigned DynRankView - * 5. subview function name now available - * 6. Copy and Copy-Assign View to DynRankView - * 7. deep_copy between Views and DynRankViews - * 8. rank( view ); returns the rank of View or DynRankView - */ #ifndef KOKKOS_DYNRANKVIEW_HPP #define KOKKOS_DYNRANKVIEW_HPP @@ -117,6 +104,14 @@ struct DynRankDimTraits { , layout.dimension[7] ); } + // Extra overload to match that for specialize types v2 + template <typename Layout, typename ... P> + KOKKOS_INLINE_FUNCTION + static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout ) + { + return computeRank(layout); + } + // Create the layout for the rank-7 view. // Non-strided Layout template <typename Layout> @@ -158,8 +153,17 @@ struct DynRankDimTraits { ); } + // Extra overload to match that for specialize types + template <typename Traits, typename ... 
P> + KOKKOS_INLINE_FUNCTION + static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout ) + { + return createLayout( layout ); + } + // Create a view from the given dimension arguments. // This is only necessary because the shmem constructor doesn't take a layout. + // NDE shmem View's are not compatible with the added view_alloc value_type / fad_dim deduction functionality template <typename ViewType, typename ViewArg> static ViewType createView( const ViewArg& arg , const size_t N0 @@ -186,7 +190,8 @@ struct DynRankDimTraits { // Non-strided Layout template <typename Layout , typename iType> KOKKOS_INLINE_FUNCTION - static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank ) + static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type + reconstructLayout( const Layout& layout , iType dynrank ) { return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0) , dynrank > 1 ? 
layout.dimension[1] : ~size_t(0) @@ -202,7 +207,8 @@ struct DynRankDimTraits { // LayoutStride template <typename Layout , typename iType> KOKKOS_INLINE_FUNCTION - static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank ) + static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type + reconstructLayout( const Layout& layout , iType dynrank ) { return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0) , dynrank > 0 ? layout.stride[0] : (0) @@ -311,6 +317,11 @@ void dyn_rank_view_verify_operator_bounds /** \brief Assign compatible default mappings */ struct ViewToDynRankViewTag {}; +} // namespace Impl +} // namespace Experimental + +namespace Impl { + template< class DstTraits , class SrcTraits > class ViewMapping< DstTraits , SrcTraits , typename std::enable_if<( @@ -337,7 +348,7 @@ class ViewMapping< DstTraits , SrcTraits , ) ) ) - ) , ViewToDynRankViewTag >::type > + ) , Kokkos::Experimental::Impl::ViewToDynRankViewTag >::type > { private: @@ -376,7 +387,7 @@ public: typedef typename DstType::offset_type dst_offset_type ; dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc - dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track ); + dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track ); dst.m_track.assign( src.m_track , DstTraits::is_managed ); dst.m_rank = src.Rank ; } @@ -384,22 +395,20 @@ public: } //end Impl +namespace Experimental { + /* \class DynRankView * \brief Container that creates a Kokkos view with rank determined at runtime. 
- * Essentially this is a rank 7 view that wraps the access operators - * to yield the functionality of a view + * Essentially this is a rank 7 view * * Changes from View * 1. The rank of the DynRankView is returned by the method rank() * 2. Max rank of a DynRankView is 7 - * 3. subview name is subdynrankview - * 4. Every subdynrankview is returned with LayoutStride - * - * NEW: Redesigned DynRankView - * 5. subview function name now available - * 6. Copy and Copy-Assign View to DynRankView - * 7. deep_copy between Views and DynRankViews - * 8. rank( view ); returns the rank of View or DynRankView + * 3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward compatibility) + * 4. Every subview is returned with LayoutStride + * 5. Copy and Copy-Assign View to DynRankView + * 6. deep_copy between Views and DynRankViews + * 7. rank( view ); returns the rank of View or DynRankView * */ @@ -427,7 +436,7 @@ public: private: - typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ; + typedef Kokkos::Impl::ViewMapping< traits , void > map_type ; typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ; track_type m_track ; @@ -556,7 +565,7 @@ public: // Allow specializations to query their specialized map KOKKOS_INLINE_FUNCTION - const Kokkos::Experimental::Impl::ViewMapping< traits , void > & + const Kokkos::Impl::ViewMapping< traits , void > & implementation_map() const { return m_map ; } //---------------------------------------- @@ -803,7 +812,7 @@ public: , m_rank(rhs.m_rank) { typedef typename DynRankView<RT,RP...> ::traits SrcTraits ; - typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" ); Mapping::assign( m_map , rhs.m_map , rhs.m_track ); } @@ -813,7 +822,7 @@ public: DynRankView & operator = (const DynRankView<RT,RP...> & 
rhs ) { typedef typename DynRankView<RT,RP...> ::traits SrcTraits ; - typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" ); Mapping::assign( m_map , rhs.m_map , rhs.m_track ); m_track.assign( rhs.m_track , traits::is_managed ); @@ -831,7 +840,7 @@ public: , m_rank( rhs.Rank ) { typedef typename View<RT,RP...>::traits SrcTraits ; - typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ; static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" ); Mapping::assign( *this , rhs ); } @@ -841,7 +850,7 @@ public: DynRankView & operator = ( const View<RT,RP...> & rhs ) { typedef typename View<RT,RP...>::traits SrcTraits ; - typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ; static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" ); Mapping::assign( *this , rhs ); return *this ; @@ -870,7 +879,7 @@ public: ) : m_track() , m_map() - , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) ) + , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) ) { // Append layout and spaces if not input typedef Impl::ViewCtorProp< P ... 
> alloc_prop_input ; @@ -923,7 +932,7 @@ public: //------------------------------------------------------------ Kokkos::Experimental::Impl::SharedAllocationRecord<> * - record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) ); + record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) ); //------------------------------------------------------------ #if defined( KOKKOS_ENABLE_CUDA ) @@ -947,8 +956,8 @@ public: >::type const & arg_layout ) : m_track() // No memory tracking - , m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) ) - , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) ) + , m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) ) + , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) ) { static_assert( std::is_same< pointer_type @@ -1034,6 +1043,7 @@ public: {} // For backward compatibility + // NDE This ctor does not take ViewCtorProp argument - should not use alternative createLayout call explicit inline DynRankView( const ViewAllocateWithoutInitializing & arg_prop , const typename traits::array_layout & arg_layout @@ -1179,6 +1189,11 @@ namespace Impl { struct DynRankSubviewTag {}; +} // namespace Impl +} // namespace Experimental + +namespace Impl { + template< class SrcTraits , class ... Args > struct ViewMapping < typename std::enable_if<( @@ -1192,7 +1207,7 @@ struct ViewMapping std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value ) - ), DynRankSubviewTag >::type + ), Kokkos::Experimental::Impl::DynRankSubviewTag >::type , SrcTraits , Args ... 
> { @@ -1264,7 +1279,7 @@ public: }; - typedef DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type; + typedef Kokkos::Experimental::DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type; template < typename T , class ... P > KOKKOS_INLINE_FUNCTION @@ -1336,9 +1351,10 @@ public: } // end Impl +namespace Experimental { template< class V , class ... Args > -using Subdynrankview = typename Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ; +using Subdynrankview = typename Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ; template< class D , class ... P , class ...Args > KOKKOS_INLINE_FUNCTION @@ -1348,7 +1364,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args. if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args { Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); } - typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ; + typedef Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ; return metafcn::subview( src.rank() , src , args... ); } diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index da96db2d6b782f2ac2f2aada57f53346365ccedb..e9059d64c41e082357b3c8b21ed861412e5d18d6 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 
2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -57,7 +57,7 @@ namespace Experimental { */ template< typename DataType , typename ... P > class DynamicView : public Kokkos::ViewTraits< DataType , P ... > -{ +{ public: typedef Kokkos::ViewTraits< DataType , P ... > traits ; @@ -68,7 +68,7 @@ private: typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ; - static_assert( traits::rank == 1 && traits::rank_dynamic == 1 + static_assert( traits::rank == 1 && traits::rank_dynamic == 1 , "DynamicView must be rank-one" ); static_assert( std::is_trivial< typename traits::value_type >::value && @@ -216,14 +216,14 @@ public: // Verify that allocation of the requested chunk in in progress. // The allocated chunk counter is m_chunks[ m_chunk_max ] - const uintptr_t n = + const uintptr_t n = *reinterpret_cast<uintptr_t volatile *>( m_chunks + m_chunk_max ); if ( n <= ic ) { Kokkos::abort("Kokkos::DynamicView array bounds error"); } - // Allocation of this chunk is in progress + // Allocation of this chunk is in progress // so wait for allocation to complete. while ( 0 == *ch ); } @@ -267,7 +267,7 @@ public: const uintptr_t jc_try = jc ; // Jump iteration to the chunk counter. 
- + jc = atomic_compare_exchange( pc , jc_try , jc_try + 1 ); if ( jc_try == jc ) { @@ -316,7 +316,7 @@ public: } else { while ( NC + 1 <= *pc ) { - --*pc ; + --*pc ; m_pool.deallocate( m_chunks[*pc] , sizeof(value_type) << m_chunk_shift ); m_chunks[*pc] = 0 ; @@ -331,7 +331,7 @@ public: typename traits::value_type ** m_chunks ; uintptr_t * m_pc ; uintptr_t m_nc ; - unsigned m_chunk_shift ; + unsigned m_chunk_shift ; KOKKOS_INLINE_FUNCTION void operator()( int ) const @@ -348,7 +348,7 @@ public: } else { while ( m_nc + 1 <= *m_pc ) { - --*m_pc ; + --*m_pc ; m_pool.deallocate( m_chunks[*m_pc] , sizeof(value_type) << m_chunk_shift ); m_chunks[*m_pc] = 0 ; @@ -482,7 +482,7 @@ public: }; - /**\brief Allocation constructor + /**\brief Allocation constructor * * Memory is allocated in chunks from the memory pool. * The chunk size conforms to the memory pool's chunk size. @@ -557,7 +557,7 @@ void deep_copy( const View<T,DP...> & dst if ( DstExecCanAccessSrc ) { // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. - Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src ); + Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src ); } else { Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); @@ -581,7 +581,7 @@ void deep_copy( const DynamicView<T,DP...> & dst if ( DstExecCanAccessSrc ) { // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. 
- Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src ); + Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src ); } else { Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); diff --git a/lib/kokkos/containers/unit_tests/TestCuda.cpp b/lib/kokkos/containers/unit_tests/TestCuda.cpp index 5a78a5de9e228fe26a113e2af40876d8f05d4464..651a4e7eb8156eb19544d4c3124ad021921e722a 100644 --- a/lib/kokkos/containers/unit_tests/TestCuda.cpp +++ b/lib/kokkos/containers/unit_tests/TestCuda.cpp @@ -69,6 +69,8 @@ #include <Kokkos_ErrorReporter.hpp> #include <TestErrorReporter.hpp> +#include <TestViewCtorPropEmbeddedDim.hpp> + //---------------------------------------------------------------------------- @@ -94,6 +96,10 @@ TEST_F( cuda , dyn_view_api) { TestDynViewAPI< double , Kokkos::Cuda >(); } +TEST_F( cuda, viewctorprop_embedded_dim ) { + TestViewCtorProp_EmbeddedDim< Kokkos::Cuda >::test_vcpt( 2, 3 ); +} + TEST_F( cuda , staticcrsgraph ) { TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >(); diff --git a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp index 2448bd077b3f06c04144b264a661a384656552db..5365d913619817e21e9e4a4fc3357394be6d8047 100644 --- a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp +++ b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp @@ -66,6 +66,8 @@ #include <Kokkos_ErrorReporter.hpp> #include <TestErrorReporter.hpp> +#include <TestViewCtorPropEmbeddedDim.hpp> + #include <iomanip> namespace Test { @@ -76,14 +78,7 @@ protected: { std::cout << std::setprecision(5) << std::scientific; - unsigned threads_count = 4 ; - - if ( Kokkos::hwloc::available() ) { - threads_count = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); - } - - Kokkos::OpenMP::initialize( threads_count ); + Kokkos::OpenMP::initialize(); } static void TearDownTestCase() @@ -96,6 +91,10 @@ TEST_F( openmp, dyn_view_api) { 
TestDynViewAPI< double , Kokkos::OpenMP >(); } +TEST_F( openmp, viewctorprop_embedded_dim ) { + TestViewCtorProp_EmbeddedDim< Kokkos::OpenMP >::test_vcpt( 2, 3 ); +} + TEST_F( openmp, bitset ) { test_bitset<Kokkos::OpenMP>(); diff --git a/lib/kokkos/containers/unit_tests/TestSerial.cpp b/lib/kokkos/containers/unit_tests/TestSerial.cpp index 06c4d9f6ed0f4d97abb3828c3983058ea09c497b..1b9b5a2da3b498a9f9c1537bc8d86295cbf9985c 100644 --- a/lib/kokkos/containers/unit_tests/TestSerial.cpp +++ b/lib/kokkos/containers/unit_tests/TestSerial.cpp @@ -67,6 +67,8 @@ #include <Kokkos_ErrorReporter.hpp> #include <TestErrorReporter.hpp> +#include <TestViewCtorPropEmbeddedDim.hpp> + namespace Test { class serial : public ::testing::Test { @@ -85,6 +87,10 @@ TEST_F( serial, dyn_view_api) { TestDynViewAPI< double , Kokkos::Serial >(); } +TEST_F( serial, viewctorprop_embedded_dim ) { + TestViewCtorProp_EmbeddedDim< Kokkos::Serial >::test_vcpt( 2, 3 ); +} + TEST_F( serial , staticcrsgraph ) { TestStaticCrsGraph::run_test_graph< Kokkos::Serial >(); diff --git a/lib/kokkos/containers/unit_tests/TestThreads.cpp b/lib/kokkos/containers/unit_tests/TestThreads.cpp index 938ec88e90f7924c61d20888a4cbc8dcddfef4bf..aca0b57d6505cdf9562ad3b05526cfdeaed4b2a9 100644 --- a/lib/kokkos/containers/unit_tests/TestThreads.cpp +++ b/lib/kokkos/containers/unit_tests/TestThreads.cpp @@ -70,6 +70,8 @@ #include <Kokkos_ErrorReporter.hpp> #include <TestErrorReporter.hpp> +#include <TestViewCtorPropEmbeddedDim.hpp> + namespace Test { class threads : public ::testing::Test { @@ -103,6 +105,10 @@ TEST_F( threads , dyn_view_api) { TestDynViewAPI< double , Kokkos::Threads >(); } +TEST_F( threads, viewctorprop_embedded_dim ) { + TestViewCtorProp_EmbeddedDim< Kokkos::Threads >::test_vcpt( 2, 3 ); +} + TEST_F( threads , staticcrsgraph ) { TestStaticCrsGraph::run_test_graph< Kokkos::Threads >(); diff --git a/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp 
b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1efd1ddc517b51af09f8d78046b612e708172f70 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp @@ -0,0 +1,213 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <Kokkos_DynRankView.hpp> + +#include <type_traits> +#include <typeinfo> + +namespace Test { + +namespace { + +template <typename ExecSpace > +struct TestViewCtorProp_EmbeddedDim { + + using ViewIntType = typename Kokkos::View< int**, ExecSpace >; + using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >; + + using DynRankViewIntType = typename Kokkos::DynRankView< int, ExecSpace >; + using DynRankViewDoubleType = typename Kokkos::DynRankView< double, ExecSpace >; + + // Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view - replace with this functor + template < class ViewType > + struct Functor { + + ViewType v; + + Functor( const ViewType & v_ ) : v(v_) {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int i ) const { + v(i) = i; + } + + }; + + + static void test_vcpt( const int N0, const int N1 ) + { + + // Create two views to test + { + using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ; + using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ; + + VIT vi1("vi1", N0, N1); + VDT vd1("vd1", N0); + + // TEST: Test for common type between two views, one with type double, other with type int + // Deduce common value_type
and construct a view with that type + { + // Two views + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1); + typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType; + typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT; + typedef typename CVT::HostMirror HostCVT; + + // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg + CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 ); + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), + Functor<CVT>(cv1) + ); + + HostCVT hcv1 = Kokkos::create_mirror_view( cv1 ); + Kokkos::deep_copy( hcv1, cv1 ); + + ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ; + #if 0 + // debug output + for ( int i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + } + + printf( " Common value type view: %s \n", typeid( CVT() ).name() ); + printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); + if ( std::is_same< CommonViewValueType, double >::value == true ) { + printf("Proper common value_type\n"); + } + else { + printf("WRONG common value_type\n"); + } + // end debug output + #endif + } + + { + // Single view + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1); + typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType; + typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT; + typedef typename CVT::HostMirror HostCVT; + + // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg + CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 ); + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), + Functor<CVT>(cv1) + ); + + HostCVT hcv1 = Kokkos::create_mirror_view( cv1 ); + Kokkos::deep_copy( hcv1, cv1 ); + + ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ; + } + + } + + // Create 
two dynamic rank views to test + { + using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType ; + using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType ; + + VIT vi1("vi1", N0, N1); + VDT vd1("vd1", N0); + + // TEST: Test for common type between two views, one with type double, other with type int + // Deduce common value_type and construct a view with that type + { + // Two views + auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1, vd1 ); + typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType; + typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT; + typedef typename CVT::HostMirror HostCVT; + + // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg + CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 ); + + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), + Functor<CVT>(cv1) + ); + + HostCVT hcv1 = Kokkos::create_mirror_view( cv1 ); + Kokkos::deep_copy( hcv1, cv1 ); + + ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ; + } + + { + // Single views + auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1 ); + typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType; + typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT; + typedef typename CVT::HostMirror HostCVT; + + // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg + CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 ); + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), + Functor<CVT>(cv1) + ); + + HostCVT hcv1 = Kokkos::create_mirror_view( cv1 ); + Kokkos::deep_copy( hcv1, cv1 ); + + ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ; + } + } + + + } // end test_vcpt + +}; // end struct + +} // namespace + +} // namespace Test diff --git 
a/lib/kokkos/containers/unit_tests/UnitTestMain.cpp b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp index f952ab3db51028aff0a0ebfe313b2639e353ab87..2b73535c833bd5a3caa4b530a6f406fe40b710ec 100644 --- a/lib/kokkos/containers/unit_tests/UnitTestMain.cpp +++ b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,12 +36,14 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ #include <gtest/gtest.h> +#include <cstdlib> +#include <Kokkos_Macros.hpp> int main(int argc, char *argv[]) { ::testing::InitGoogleTest(&argc,argv); diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile index f59e7bbe1c1c377f8d23aa2760323d48bbc6bafe..bb9353f5834e547cb4eee1326bf4115964a5deb1 100644 --- a/lib/kokkos/core/perf_test/Makefile +++ b/lib/kokkos/core/perf_test/Makefile @@ -79,7 +79,6 @@ test-mempool: KokkosCore_PerformanceTest_Mempool test-taskdag: KokkosCore_PerformanceTest_TaskDAG ./KokkosCore_PerformanceTest_TaskDAG - build_all: $(TARGETS) test: $(TEST_TARGETS) diff --git a/lib/kokkos/core/perf_test/PerfTestMain.cpp b/lib/kokkos/core/perf_test/PerfTestMain.cpp index d80cfab8b58b87825f2b114b6bf7aed909555ad7..832f650b9a8d7b4ae189551389dce0e1de537a6d 100644 --- a/lib/kokkos/core/perf_test/PerfTestMain.cpp +++ b/lib/kokkos/core/perf_test/PerfTestMain.cpp @@ -1,13 +1,13 @@ /* //@HEADER // 
************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,12 +36,14 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ #include <gtest/gtest.h> +#include <cstdlib> + #include <Kokkos_Core.hpp> namespace Test { diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp new file mode 100644 index 0000000000000000000000000000000000000000..46321378d9b4003c14c0165e0ef077e693a0b26a --- /dev/null +++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp @@ -0,0 +1,2715 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP +#define KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP + +#include <Kokkos_Macros.hpp> +#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA ) + +#include <iostream> +#include <algorithm> +#include <cstdio> + +#include <utility> + +// #include<Cuda/Kokkos_CudaExec.hpp> +// Including the file above leads to following type of errors: +// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete type is not allowed +// use existing Kokkos functionality, e.g. 
max blocks, once resolved + +#if defined(KOKKOS_ENABLE_PROFILING) +#include <impl/Kokkos_Profiling_Interface.hpp> +#include <typeinfo> +#endif + +namespace Kokkos { namespace Experimental { namespace Impl { + +namespace Refactor { + +// ------------------------------------------------------------------ // +// ParallelFor iteration pattern +template< int N , typename RP , typename Functor , typename Tag > +struct DeviceIterateTile; + +//Rank 2 +// Specializations for void tag type +template< typename RP , typename Functor > +struct DeviceIterateTile<2,RP,Functor,void > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ ) + : m_rp(rp_) + , m_func(f_) + {} + + inline __device__ + void exec_range() const + { + // LL + if (RP::inner_direction == RP::Left) { + for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { + + for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { + m_func(offset_0 , offset_1); + } + } + } + } + } + // LR + else { + for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { + + for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) 
{ + m_func(offset_0 , offset_1); + } + } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; +}; + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag > +struct DeviceIterateTile<2,RP,Functor,Tag> +{ + using index_type = typename RP::index_type; + + inline __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ ) + : m_rp(rp_) + , m_func(f_) + {} + + inline __device__ + void exec_range() const + { + if (RP::inner_direction == RP::Left) { + // Loop over size maxnumblocks until full range covered + for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { + + for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { + m_func(Tag(), offset_0 , offset_1); + } + } + } + } + } + else { + for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { + + for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { + m_func(Tag(), offset_0 , offset_1); + } + } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; +}; + + +//Rank 3 +// Specializations for void tag type +template< typename RP , typename Functor > +struct 
DeviceIterateTile<3,RP,Functor,void > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ ) + : m_rp(rp_) + , m_func(f_) + {} + + inline __device__ + void exec_range() const + { + // LL + if (RP::inner_direction == RP::Left) { + for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { + + for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { + + for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { + m_func(offset_0 , offset_1 , offset_2); + } + } + } + } + } + } + } + // LR + else { + for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { + + for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { + + for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + if ( offset_2 < m_rp.m_upper[2] && 
(index_type)threadIdx.z < m_rp.m_tile[2] ) { + m_func(offset_0 , offset_1 , offset_2); + } + } + } + } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; +}; + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag > +struct DeviceIterateTile<3,RP,Functor,Tag> +{ + using index_type = typename RP::index_type; + + inline __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ ) + : m_rp(rp_) + , m_func(f_) + {} + + inline __device__ + void exec_range() const + { + if (RP::inner_direction == RP::Left) { + for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { + + for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { + + for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { + m_func(Tag(), offset_0 , offset_1 , offset_2); + } + } + } + } + } + } + } + else { + for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { + + for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + if ( offset_1 <
m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { + + for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { + m_func(Tag(), offset_0 , offset_1 , offset_2); + } + } + } + } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; +}; + + +//Rank 4 +// Specializations for void tag type +template< typename RP , typename Functor > +struct DeviceIterateTile<4,RP,Functor,void > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ ) + : m_rp(rp_) + , m_func(f_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + // LL + if (RP::inner_direction == RP::Left) { + const index_type temp0 = m_rp.m_tile_end[0]; + const index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) : + ( temp1 <= max_blocks ? 
temp1 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x % numbl0; + const index_type tile_id1 = (index_type)blockIdx.x / numbl0; + const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; + const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; + + for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { + + for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { + + for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + m_func(offset_0 , offset_1 , offset_2 , offset_3); + } + } + } + } + } + } + } + } + } + // LR + else { + const index_type temp0 = m_rp.m_tile_end[0]; + const index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) : + ( temp0 <= max_blocks ? 
temp0 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x / numbl1; + const index_type tile_id1 = (index_type)blockIdx.x % numbl1; + const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; + const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; + + for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + + for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { + + for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { + m_func(offset_0 , offset_1 , offset_2 , offset_3); + } + } + } + } + } + } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; +}; + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag > +struct DeviceIterateTile<4,RP,Functor,Tag> +{ + using index_type = typename RP::index_type; + + inline __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ ) + : m_rp(rp_) + , m_func(f_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks =
static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if (RP::inner_direction == RP::Left) { + const index_type temp0 = m_rp.m_tile_end[0]; + const index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) : + ( temp1 <= max_blocks ? temp1 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x % numbl0; + const index_type tile_id1 = (index_type)blockIdx.x / numbl0; + const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; + const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; + + for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { + + for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { + + for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3); + } + } + } + } + } + } + } + } + } + else { + const index_type temp0 = m_rp.m_tile_end[0]; + const index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl1 = ( temp1 <= 
max_blocks ? temp1 : max_blocks ) ; + const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) : + ( temp0 <= max_blocks ? temp0 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x / numbl1; + const index_type tile_id1 = (index_type)blockIdx.x % numbl1; + const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; + const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; + + for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + + for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { + // offset_1 must follow the strided loop index j (as in the other rank-4 branches), not the fixed starting tile_id1 + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { + + for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { + m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3); + } + } + } + } + } + } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; +}; + + +//Rank 5 +// Specializations for void tag type +template< typename RP , typename Functor > +struct DeviceIterateTile<5,RP,Functor,void > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ ) + : m_rp(rp_) + , m_func(f_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks =
static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + // LL + if (RP::inner_direction == RP::Left) { + + index_type temp0 = m_rp.m_tile_end[0]; + index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) : + ( temp1 <= max_blocks ? temp1 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x % numbl0; + const index_type tile_id1 = (index_type)blockIdx.x / numbl0; + const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; + const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; + + temp0 = m_rp.m_tile_end[2]; + temp1 = m_rp.m_tile_end[3]; + const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) : + ( temp1 <= max_blocks ? 
temp1 : max_blocks ) ); + + const index_type tile_id2 = (index_type)blockIdx.y % numbl2; + const index_type tile_id3 = (index_type)blockIdx.y / numbl2; + const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; + const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; + + for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { + + for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { + + for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { + + for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4); + } + } + } + } + } + } + } + } + } + } + } + // LR + else { + index_type temp0 = m_rp.m_tile_end[0]; + index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) : + ( temp0 <= max_blocks ? 
temp0 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x / numbl1; + const index_type tile_id1 = (index_type)blockIdx.x % numbl1; + const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; + const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; + + temp0 = m_rp.m_tile_end[2]; + temp1 = m_rp.m_tile_end[3]; + const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) : + ( temp0 <= max_blocks ? temp0 : max_blocks ) ); + + const index_type tile_id2 = (index_type)blockIdx.y / numbl3; + const index_type tile_id3 = (index_type)blockIdx.y % numbl3; + const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; + const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; + + for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + + for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { + + for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { + + for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { + m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4); + } + } + } + } + } 
+ } + } + } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; +}; + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag > +struct DeviceIterateTile<5,RP,Functor,Tag> +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ ) + : m_rp(rp_) + , m_func(f_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + // LL + if (RP::inner_direction == RP::Left) { + index_type temp0 = m_rp.m_tile_end[0]; + index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) : + ( temp1 <= max_blocks ? temp1 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x % numbl0; + const index_type tile_id1 = (index_type)blockIdx.x / numbl0; + const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; + const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; + + temp0 = m_rp.m_tile_end[2]; + temp1 = m_rp.m_tile_end[3]; + const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) : + ( temp1 <= max_blocks ? 
temp1 : max_blocks ) ); + + const index_type tile_id2 = (index_type)blockIdx.y % numbl2; + const index_type tile_id3 = (index_type)blockIdx.y / numbl2; + const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; + const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; + + for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { + + for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { + + for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { + + for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4); + } + } + } + } + } + } + } + } + } + } + } + // LR + else { + index_type temp0 = m_rp.m_tile_end[0]; + index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) : + ( temp0 <= max_blocks ? 
temp0 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x / numbl1; + const index_type tile_id1 = (index_type)blockIdx.x % numbl1; + const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; + const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; + + temp0 = m_rp.m_tile_end[2]; + temp1 = m_rp.m_tile_end[3]; + const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) : + ( temp0 <= max_blocks ? temp0 : max_blocks ) ); + + const index_type tile_id2 = (index_type)blockIdx.y / numbl3; + const index_type tile_id3 = (index_type)blockIdx.y % numbl3; + const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; + const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; + + for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + + for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { + + for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { + + for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { + m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4); + } + } + } 
+ } + } + } + } + } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; +}; + + +//Rank 6 +// Specializations for void tag type +template< typename RP , typename Functor > +struct DeviceIterateTile<6,RP,Functor,void > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ ) + : m_rp(rp_) + , m_func(f_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + // LL + if (RP::inner_direction == RP::Left) { + index_type temp0 = m_rp.m_tile_end[0]; + index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) : + ( temp1 <= max_blocks ? temp1 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x % numbl0; + const index_type tile_id1 = (index_type)blockIdx.x / numbl0; + const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; + const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; + + temp0 = m_rp.m_tile_end[2]; + temp1 = m_rp.m_tile_end[3]; + const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) : + ( temp1 <= max_blocks ? 
temp1 : max_blocks ) ); + + const index_type tile_id2 = (index_type)blockIdx.y % numbl2; + const index_type tile_id3 = (index_type)blockIdx.y / numbl2; + const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; + const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; + + temp0 = m_rp.m_tile_end[4]; + temp1 = m_rp.m_tile_end[5]; + const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) : + ( temp1 <= max_blocks ? temp1 : max_blocks ) ); + + const index_type tile_id4 = (index_type)blockIdx.z % numbl4; + const index_type tile_id5 = (index_type)blockIdx.z / numbl4; + const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4]; + const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; + + for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { + + for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { + + for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { + + for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { + + for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < 
m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); + } + } + } + } + } + } + } + } + } + } + } + } + } + // LR + else { + index_type temp0 = m_rp.m_tile_end[0]; + index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) : + ( temp0 <= max_blocks ? temp0 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x / numbl1; + const index_type tile_id1 = (index_type)blockIdx.x % numbl1; + const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; + const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; + + temp0 = m_rp.m_tile_end[2]; + temp1 = m_rp.m_tile_end[3]; + const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) : + ( temp0 <= max_blocks ? temp0 : max_blocks ) ); + + const index_type tile_id2 = (index_type)blockIdx.y / numbl3; + const index_type tile_id3 = (index_type)blockIdx.y % numbl3; + const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; + const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; + + temp0 = m_rp.m_tile_end[4]; + temp1 = m_rp.m_tile_end[5]; + const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) : + ( temp0 <= max_blocks ? 
temp0 : max_blocks ) ); + + const index_type tile_id4 = (index_type)blockIdx.z / numbl5; + const index_type tile_id5 = (index_type)blockIdx.z % numbl5; + const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5]; + const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; + + for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + + for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { + + for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { + + for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { + + for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { + m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); + } + } + } + } + } + } + } + } + } + } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; +}; + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag > +struct DeviceIterateTile<6,RP,Functor,Tag> +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ ) + : m_rp(rp_) + , m_func(f_) + {} + + 
static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + // LL + if (RP::inner_direction == RP::Left) { + index_type temp0 = m_rp.m_tile_end[0]; + index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) : + ( temp1 <= max_blocks ? temp1 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x % numbl0; + const index_type tile_id1 = (index_type)blockIdx.x / numbl0; + const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; + const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; + + temp0 = m_rp.m_tile_end[2]; + temp1 = m_rp.m_tile_end[3]; + const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) : + ( temp1 <= max_blocks ? temp1 : max_blocks ) ); + + const index_type tile_id2 = (index_type)blockIdx.y % numbl2; + const index_type tile_id3 = (index_type)blockIdx.y / numbl2; + const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; + const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; + + temp0 = m_rp.m_tile_end[4]; + temp1 = m_rp.m_tile_end[5]; + const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ; + const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) : + ( temp1 <= max_blocks ? 
temp1 : max_blocks ) ); + + const index_type tile_id4 = (index_type)blockIdx.z % numbl4; + const index_type tile_id5 = (index_type)blockIdx.z / numbl4; + const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4]; + const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; + + for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { + + for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { + + for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { + + for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { + + for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); + } + } + } + } + } + } + } + } + } + } + } + } + } + // LR + else { + index_type temp0 = m_rp.m_tile_end[0]; + index_type temp1 = m_rp.m_tile_end[1]; + const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) : + ( temp0 <= max_blocks ? 
temp0 : max_blocks ) ); + + const index_type tile_id0 = (index_type)blockIdx.x / numbl1; + const index_type tile_id1 = (index_type)blockIdx.x % numbl1; + const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; + const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; + + temp0 = m_rp.m_tile_end[2]; + temp1 = m_rp.m_tile_end[3]; + const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) : + ( temp0 <= max_blocks ? temp0 : max_blocks ) ); + + const index_type tile_id2 = (index_type)blockIdx.y / numbl3; + const index_type tile_id3 = (index_type)blockIdx.y % numbl3; + const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; + const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; + + temp0 = m_rp.m_tile_end[4]; + temp1 = m_rp.m_tile_end[5]; + const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ; + const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) : + ( temp0 <= max_blocks ? 
temp0 : max_blocks ) ); + + const index_type tile_id4 = (index_type)blockIdx.z / numbl5; + const index_type tile_id5 = (index_type)blockIdx.z % numbl5; + const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5]; + const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; + + for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { + + for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { + + for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { + + for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { + + for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { + + for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { + m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); + } + } + } + } + } + } + } + } + } + } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; +}; + +} // Refactor + +// ---------------------------------------------------------------------------------- + +namespace Reduce { + +template < typename T > +using is_void = std::is_same< T, void >; + +template < typename T > +struct is_array_type : std::false_type +{ + using value_type = T; +}; 
+ +template < typename T > +struct is_array_type< T* > : std::true_type +{ + using value_type = T; +}; + +template < typename T > +struct is_array_type< T[] > : std::true_type +{ + using value_type = T; +}; + +// ------------------------------------------------------------------ // +template< int N , typename RP , typename Functor , typename Tag , typename ValueType , typename Enable = void > +struct DeviceIterateTile; + +// ParallelReduce iteration pattern +// Scalar reductions + +// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of tiles and reduction algorithm constraints +// extract n-dim tile offsets (i.e. tile's global starting multi-index) from the tileid = blockid using tile dimensions +// local indices within a tile extracted from (index_type)threadIdx.x using tile dims, constrained by blocksize +// combine tile and local id info for multi-dim global ids + +// Pattern: +// Each block+thread is responsible for a tile+local_id combo (additional when striding by num_blocks) +// 1. create offset arrays +// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max num blocks) +// 3. temps set for tile_idx and thrd_idx, which will be modified +// 4.
if LL vs LR: +// determine tile starting point offsets (multidim) +// determine local index offsets (multidim) +// add tile offset + local offset for global multi-dim index +// if offset within range bounds AND local offset within tile bounds, call functor + +// ValueType = T +//Rank 2 +// Specializations for void tag type +template< typename RP , typename Functor , typename ValueType > +struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< !is_array_type<ValueType>::value >::type > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + inline __device__ + void exec_range() const + { + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // thread's local id offset within the tile + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + // Deduce this block's tile_id + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { + m_func( m_offset[0], m_offset[1], m_v ); + } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; +
tile_idx /= m_rp.m_tile_end[i]; + + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_v ); } + } + } + } + + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + ValueType & m_v; +}; + + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag, typename ValueType > +struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type > +{ + using index_type = typename RP::index_type; + + inline __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + inline __device__ + void exec_range() const + { + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } 
+ } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + ValueType & m_v; +}; + + +//Rank 3 +// Specializations for void tag type +template< typename RP , typename Functor , typename ValueType > +struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + inline __device__ + void exec_range() const + { + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + 
tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + ValueType & m_v; +}; + + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag, typename ValueType > +struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type > +{ + using index_type = typename RP::index_type; + + inline __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + inline __device__ + void exec_range() const + { + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // thread's local id offset within the tile + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x 
) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + ValueType & m_v; +}; + + +//Rank 4 +// Specializations for void tag type +template< typename RP , typename Functor , typename ValueType > +struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 
65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + 
} + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + ValueType & m_v; +}; + + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag, typename ValueType > +struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type > +{ + using index_type = typename RP::index_type; + + inline __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // thread's local id offset within the tile + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + 
+ m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + ValueType & m_v; +}; + + +//Rank 5 +// Specializations for void tag type +template< typename RP , typename Functor , typename ValueType > +struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; 
// tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + ValueType & m_v; +}; + + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag, typename ValueType > +struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & 
rp_ , const Functor & f_ , ValueType & v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= 
m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + ValueType & m_v; +}; + + +//Rank 6 +// Specializations for void tag type +template< typename RP , typename Functor , typename ValueType > +struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; 
+ + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + ValueType & m_v; +}; + + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag, typename ValueType > +struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type > +{ + using index_type = typename RP::index_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); 
+ if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + ValueType & m_v; +}; + + +// ValueType = T[], T* +//Rank 2 +// Specializations for void tag type +template< typename RP , typename 
Functor , typename ValueType > +struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< is_array_type<ValueType>::value >::type > +{ + using index_type = typename RP::index_type; + using value_type = typename is_array_type< ValueType >::value_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + inline __device__ + void exec_range() const + { + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { + m_func( m_offset[0], m_offset[1], m_v ); + } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] 
< m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + value_type* m_v; +}; + + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag, typename ValueType > +struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type > +{ + using index_type = typename RP::index_type; + using value_type = typename is_array_type< ValueType >::value_type; + + inline __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + inline __device__ + void exec_range() const + { + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + 
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_v ); } + } + } //end for loop over num_tiles - product of tiles in each direction + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + value_type* m_v; +}; + + +//Rank 3 +// Specializations for void tag type +template< typename RP , typename Functor , typename ValueType > +struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type > +{ + using index_type = typename RP::index_type; + using value_type = typename is_array_type< ValueType >::value_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + inline __device__ + void exec_range() const + { + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices 
identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + value_type* m_v; +}; + + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag, typename ValueType > +struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type > +{ + using index_type = typename RP::index_type; + using value_type = typename is_array_type< ValueType >::value_type; + + inline __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + inline __device__ + void exec_range() const + { + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // thread's local id offset within the tile + + for ( index_type tileidx = 
(index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + value_type* m_v; +}; + + +//Rank 4 +// Specializations for void tag type +template< typename RP , typename Functor , typename ValueType > +struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type > +{ + using index_type = typename RP::index_type; + using value_type = typename is_array_type< ValueType >::value_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_) + : m_rp(rp_) + 
, m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < 
m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + value_type* m_v; +}; + + +// Specializations for void tag type +template< typename RP , typename Functor , typename Tag, typename ValueType > +struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type > +{ + using index_type = typename RP::index_type; + using value_type = typename is_array_type< ValueType >::value_type; + + inline __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= 
m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + value_type* m_v; +}; + + +//Rank 5 +// Specializations for void tag type +template< typename RP , typename Functor , typename ValueType > +struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type > +{ + using index_type = typename RP::index_type; + using value_type = typename is_array_type< ValueType >::value_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( 
Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + value_type* m_v; +}; + + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag, 
typename ValueType > +struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type > +{ + using index_type = typename RP::index_type; + using value_type = typename is_array_type< ValueType >::value_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], 
m_offset[4], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + value_type* m_v; +}; + + +//Rank 6 +// Specializations for void tag type +template< typename RP , typename Functor , typename ValueType > +struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type > +{ + using index_type = typename RP::index_type; + using value_type = typename is_array_type< ValueType >::value_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_) + : m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { 
+ index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + value_type* m_v; +}; + + +// Specializations for tag type +template< typename RP , typename Functor , typename Tag, typename ValueType > +struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type > +{ + using index_type = typename RP::index_type; + using value_type = typename is_array_type< ValueType >::value_type; + + __device__ + DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_) + : 
m_rp(rp_) + , m_func(f_) + , m_v(v_) + {} + + static constexpr index_type max_blocks = 65535; + //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + + inline __device__ + void exec_range() const + { + //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; + //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() ); + if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) { + index_type m_offset[RP::rank]; // tile starting global id offset + index_type m_local_offset[RP::rank]; // tile starting global id offset + + for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) { + index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets + index_type thrd_idx = (index_type)threadIdx.y; + bool in_bounds = true; + + // LL + if (RP::inner_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); } + } + // LR + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + + // tile-local indices identified with (index_type)threadIdx.y + m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); + thrd_idx /= m_rp.m_tile[i]; + + m_offset[i] += 
m_local_offset[i]; + if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) { + in_bounds &= false; + } + } + if ( in_bounds ) + { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); } + } + } + } + } //end exec_range + +private: + const RP & m_rp; + const Functor & m_func; + value_type* m_v; +}; + +} // Reduce + +// ---------------------------------------------------------------------------------- + +} } } //end namespace Kokkos::Experimental::Impl + +#endif +#endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp index 13abcfd93c241f7601a58c3003b54dd5b281936b..cae8ecd489f7917fd3ccc1c0f6628000f6351773 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp @@ -53,6 +53,7 @@ #include <impl/Kokkos_Error.hpp> #include <Cuda/Kokkos_Cuda_abort.hpp> #include <Cuda/Kokkos_Cuda_Error.hpp> +#include <Cuda/Kokkos_Cuda_Locks.hpp> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -125,53 +126,12 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits: #endif - -namespace Kokkos { -namespace Impl { - struct CudaLockArraysStruct { - int* atomic; - int* scratch; - int* threadid; - int n; - }; -} -} -__device__ __constant__ -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -extern -#endif -Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ; - -#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF -#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39 - namespace Kokkos { namespace Impl { void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false); } } -namespace Kokkos { -namespace Impl { -__device__ inline -bool lock_address_cuda_space(void* ptr) { - size_t offset = size_t(ptr); - offset = offset >> 2; - offset = offset & CUDA_SPACE_ATOMIC_MASK; - return (0 
== atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1)); -} - -__device__ inline -void unlock_address_cuda_space(void* ptr) { - size_t offset = size_t(ptr); - offset = offset >> 2; - offset = offset & CUDA_SPACE_ATOMIC_MASK; - atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0); -} - -} -} - template< typename T > inline __device__ @@ -192,8 +152,19 @@ namespace Impl { // For 2.0 capability: 48 KB L1 and 16 KB shared //---------------------------------------------------------------------------- -template< class DriverType > +template< class DriverType> +__global__ +static void cuda_parallel_launch_constant_memory() +{ + const DriverType & driver = + *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer ); + + driver(); +} + +template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ +__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() { const DriverType & driver = @@ -202,19 +173,28 @@ static void cuda_parallel_launch_constant_memory() driver(); } -template< class DriverType > +template< class DriverType> __global__ static void cuda_parallel_launch_local_memory( const DriverType driver ) { driver(); } -template < class DriverType , - bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) > +template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > +__global__ +__launch_bounds__(maxTperB, minBperSM) +static void cuda_parallel_launch_local_memory( const DriverType driver ) +{ + driver(); +} + +template < class DriverType + , class LaunchBounds = Kokkos::LaunchBounds<> + , bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) > struct CudaParallelLaunch ; -template < class DriverType > -struct CudaParallelLaunch< DriverType , true > { +template < class DriverType, class LaunchBounds > +struct CudaParallelLaunch< DriverType, LaunchBounds, true > { inline CudaParallelLaunch( const DriverType & driver @@ -238,26 
+218,19 @@ struct CudaParallelLaunch< DriverType , true > { } #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads else if ( shmem ) { - CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) ); + CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) ); } else { - CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) ); + CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) ); } #endif // Copy functor to constant memory on the device cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) ); - #ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - Kokkos::Impl::CudaLockArraysStruct locks; - locks.atomic = atomic_lock_array_cuda_space_ptr(false); - locks.scratch = scratch_lock_array_cuda_space_ptr(false); - locks.threadid = threadid_lock_array_cuda_space_ptr(false); - locks.n = Kokkos::Cuda::concurrency(); - cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ); - #endif + KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); // Invoke the driver function on the device - cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>(); + cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>(); #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) CUDA_SAFE_CALL( cudaGetLastError() ); @@ -267,8 +240,8 @@ struct CudaParallelLaunch< DriverType , true > { } }; -template < class DriverType > -struct CudaParallelLaunch< DriverType , false > { +template < class DriverType, class LaunchBounds > +struct CudaParallelLaunch< 
DriverType, LaunchBounds, false > { inline CudaParallelLaunch( const DriverType & driver @@ -284,22 +257,15 @@ struct CudaParallelLaunch< DriverType , false > { } #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads else if ( shmem ) { - CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) ); + CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) ); } else { - CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) ); + CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) ); } #endif - #ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - Kokkos::Impl::CudaLockArraysStruct locks; - locks.atomic = atomic_lock_array_cuda_space_ptr(false); - locks.scratch = scratch_lock_array_cuda_space_ptr(false); - locks.threadid = threadid_lock_array_cuda_space_ptr(false); - locks.n = Kokkos::Cuda::concurrency(); - cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ); - #endif + KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver ); + cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver ); #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) CUDA_SAFE_CALL( cudaGetLastError() ); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 406b4f1e228065cffe087e61765ec38c5278ff23..b699f0d6baa2e0304a0c69808cedbe972f3b9e82 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -230,18 +230,6 @@ 
void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t } catch(...) {} } -constexpr const char* CudaSpace::name() { - return m_name; -} - -constexpr const char* CudaUVMSpace::name() { - return m_name; -} - -constexpr const char* CudaHostPinnedSpace::name() { - return m_name; -} - } // namespace Kokkos //---------------------------------------------------------------------------- @@ -655,11 +643,12 @@ reallocate_tracked( void * const arg_alloc_ptr SharedAllocationRecord< Kokkos::CudaSpace , void > * SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr ) { - using Header = SharedAllocationHeader ; using RecordBase = SharedAllocationRecord< void , void > ; using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ; #if 0 + using Header = SharedAllocationHeader ; + // Copy the header from the allocation Header head ; @@ -812,83 +801,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail ); } -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace { - __global__ void init_lock_array_kernel_atomic() { - unsigned i = blockIdx.x*blockDim.x + threadIdx.x; - - if(i<CUDA_SPACE_ATOMIC_MASK+1) - kokkos_impl_cuda_lock_arrays.atomic[i] = 0; - } - - __global__ void init_lock_array_kernel_scratch_threadid(int N) { - unsigned i = blockIdx.x*blockDim.x + threadIdx.x; - - if(i<N) { - kokkos_impl_cuda_lock_arrays.scratch[i] = 0; - kokkos_impl_cuda_lock_arrays.threadid[i] = 0; - } - } -} - - -namespace Impl { -int* atomic_lock_array_cuda_space_ptr(bool deallocate) { - static int* ptr = NULL; - if(deallocate) { - cudaFree(ptr); - ptr = NULL; - } - - if(ptr==NULL && !deallocate) - 
cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)); - return ptr; -} - -int* scratch_lock_array_cuda_space_ptr(bool deallocate) { - static int* ptr = NULL; - if(deallocate) { - cudaFree(ptr); - ptr = NULL; - } - - if(ptr==NULL && !deallocate) - cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency())); - return ptr; -} - -int* threadid_lock_array_cuda_space_ptr(bool deallocate) { - static int* ptr = NULL; - if(deallocate) { - cudaFree(ptr); - ptr = NULL; - } - - if(ptr==NULL && !deallocate) - cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency())); - return ptr; -} - -void init_lock_arrays_cuda_space() { - static int is_initialized = 0; - if(! is_initialized) { - Kokkos::Impl::CudaLockArraysStruct locks; - locks.atomic = atomic_lock_array_cuda_space_ptr(false); - locks.scratch = scratch_lock_array_cuda_space_ptr(false); - locks.threadid = threadid_lock_array_cuda_space_ptr(false); - locks.n = Kokkos::Cuda::concurrency(); - cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ); - init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>(); - init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency()); - } -} - void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) { static void* ptr = NULL; static std::int64_t current_size = 0; @@ -908,8 +820,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) { return ptr; } -} -} +} // namespace Impl +} // namespace Kokkos #else void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {} #endif // KOKKOS_ENABLE_CUDA diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp index daf55cbd97b9931364f14a4f40b91a3e119bff00..80e8f9bd8a0c9e797a6c90bd3f5c427e24389b72 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp @@ -51,6 +51,7 @@ #include <Cuda/Kokkos_Cuda_Error.hpp> #include 
<Cuda/Kokkos_Cuda_Internal.hpp> +#include <Cuda/Kokkos_Cuda_Locks.hpp> #include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Profiling_Interface.hpp> @@ -69,9 +70,6 @@ __device__ __constant__ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ; -__device__ __constant__ -Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ; - #endif /*--------------------------------------------------------------------------*/ @@ -103,6 +101,7 @@ int cuda_kernel_arch() return arch ; } +#ifdef KOKKOS_ENABLE_CUDA_UVM bool cuda_launch_blocking() { const char * env = getenv("CUDA_LAUNCH_BLOCKING"); @@ -111,16 +110,13 @@ bool cuda_launch_blocking() return atoi(env); } +#endif } void cuda_device_synchronize() { -// static const bool launch_blocking = cuda_launch_blocking(); - -// if (!launch_blocking) { - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); -// } + CUDA_SAFE_CALL( cudaDeviceSynchronize() ); } void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line ) @@ -240,6 +236,7 @@ public: unsigned m_maxWarpCount ; unsigned m_maxBlock ; unsigned m_maxSharedWords ; + uint32_t m_maxConcurrency ; size_type m_scratchSpaceCount ; size_type m_scratchFlagsCount ; size_type m_scratchUnifiedCount ; @@ -248,6 +245,7 @@ public: size_type * m_scratchSpace ; size_type * m_scratchFlags ; size_type * m_scratchUnified ; + uint32_t * m_scratchConcurrentBitset ; cudaStream_t * m_stream ; static int was_initialized; @@ -274,6 +272,7 @@ public: , m_maxWarpCount( 0 ) , m_maxBlock( 0 ) , m_maxSharedWords( 0 ) + , m_maxConcurrency( 0 ) , m_scratchSpaceCount( 0 ) , m_scratchFlagsCount( 0 ) , m_scratchUnifiedCount( 0 ) @@ -282,6 +281,7 @@ public: , m_scratchSpace( 0 ) , m_scratchFlags( 0 ) , m_scratchUnified( 0 ) + , m_scratchConcurrentBitset( 0 ) , m_stream( 0 ) {} @@ -327,7 +327,8 @@ CudaInternal::~CudaInternal() if ( m_stream || m_scratchSpace || m_scratchFlags || - m_scratchUnified ) { 
+ m_scratchUnified || + m_scratchConcurrentBitset ) { std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" << std::endl ; std::cerr.flush(); @@ -339,6 +340,7 @@ CudaInternal::~CudaInternal() m_maxWarpCount = 0 ; m_maxBlock = 0 ; m_maxSharedWords = 0 ; + m_maxConcurrency = 0 ; m_scratchSpaceCount = 0 ; m_scratchFlagsCount = 0 ; m_scratchUnifiedCount = 0 ; @@ -347,6 +349,7 @@ CudaInternal::~CudaInternal() m_scratchSpace = 0 ; m_scratchFlags = 0 ; m_scratchUnified = 0 ; + m_scratchConcurrentBitset = 0 ; m_stream = 0 ; } @@ -485,6 +488,33 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) ); } //---------------------------------- + // Concurrent bitset for obtaining unique tokens from within + // an executing kernel. + { + const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0 + + m_maxConcurrency = + max_threads_per_sm * cudaProp.multiProcessorCount ; + + const int32_t buffer_bound = + Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency ); + + // Allocate and initialize uint32_t[ buffer_bound ] + + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ; + + Record * const r = Record::allocate( Kokkos::CudaSpace() + , "InternalScratchBitset" + , sizeof(uint32_t) * buffer_bound ); + + Record::increment( r ); + + m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>( r->data() ); + + CUDA_SAFE_CALL( cudaMemset( m_scratchConcurrentBitset , 0 , sizeof(uint32_t) * buffer_bound ) ); + + } + //---------------------------------- if ( stream_count ) { m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) ); @@ -543,16 +573,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) cudaThreadSetCacheConfig(cudaFuncCachePreferShared); // Init the array for used for arbitrarily sized atomics - Impl::init_lock_arrays_cuda_space(); - - #ifdef 
KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - Kokkos::Impl::CudaLockArraysStruct locks; - locks.atomic = atomic_lock_array_cuda_space_ptr(false); - locks.scratch = scratch_lock_array_cuda_space_ptr(false); - locks.threadid = threadid_lock_array_cuda_space_ptr(false); - locks.n = Kokkos::Cuda::concurrency(); - cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ); - #endif + Impl::initialize_host_cuda_lock_arrays(); } //---------------------------------------------------------------------------- @@ -635,9 +656,7 @@ void CudaInternal::finalize() was_finalized = 1; if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) { - atomic_lock_array_cuda_space_ptr(true); - scratch_lock_array_cuda_space_ptr(true); - threadid_lock_array_cuda_space_ptr(true); + Impl::finalize_host_cuda_lock_arrays(); if ( m_stream ) { for ( size_type i = 1 ; i < m_streamCount ; ++i ) { @@ -653,6 +672,7 @@ void CudaInternal::finalize() RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) ); RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) ); RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) ); + RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) ); m_cudaDev = -1 ; m_multiProcCount = 0 ; @@ -666,6 +686,7 @@ void CudaInternal::finalize() m_scratchSpace = 0 ; m_scratchFlags = 0 ; m_scratchUnified = 0 ; + m_scratchConcurrentBitset = 0 ; m_stream = 0 ; } } @@ -713,9 +734,8 @@ namespace Kokkos { Cuda::size_type Cuda::detect_device_count() { return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; } -int Cuda::concurrency() { - return 131072; -} +int Cuda::concurrency() +{ return Impl::CudaInternal::singleton().m_maxConcurrency ; } int Cuda::is_initialized() { return Impl::CudaInternal::singleton().is_initialized(); } @@ -798,7 +818,22 @@ void Cuda::fence() const char* Cuda::name() { return "Cuda"; } } // namespace Kokkos + +namespace Kokkos { +namespace Experimental { + +UniqueToken< Kokkos::Cuda , 
Kokkos::Experimental::UniqueTokenScope::Global >:: +UniqueToken( Kokkos::Cuda const & ) + : m_buffer( Kokkos::Impl::CudaInternal::singleton().m_scratchConcurrentBitset ) + , m_count( Kokkos::Impl::CudaInternal::singleton().m_maxConcurrency ) + {} + +} // namespace Experimental +} // namespace Kokkos + #else + void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {} + #endif // KOKKOS_ENABLE_CUDA diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp new file mode 100644 index 0000000000000000000000000000000000000000..237022ad23d30aee694f53d8f35bd4f98be012b9 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp @@ -0,0 +1,119 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#ifdef KOKKOS_ENABLE_CUDA + +#include <Cuda/Kokkos_Cuda_Locks.hpp> +#include <Cuda/Kokkos_Cuda_Error.hpp> +#include <Kokkos_Cuda.hpp> + +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE +namespace Kokkos { +namespace Impl { +__device__ __constant__ +CudaLockArrays g_device_cuda_lock_arrays = { nullptr, nullptr, 0 }; +} +} +#endif + +namespace Kokkos { + +namespace { + +__global__ void init_lock_array_kernel_atomic() { + unsigned i = blockIdx.x*blockDim.x + threadIdx.x; + if(i<CUDA_SPACE_ATOMIC_MASK+1) { + Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0; + } +} + +__global__ void init_lock_array_kernel_threadid(int N) { + unsigned i = blockIdx.x*blockDim.x + threadIdx.x; + if(i<(unsigned)N) { + Kokkos::Impl::g_device_cuda_lock_arrays.scratch[i] = 0; + } +} + +} // namespace + +namespace Impl { + +CudaLockArrays g_host_cuda_lock_arrays = { nullptr, nullptr, 0 }; + +void initialize_host_cuda_lock_arrays() { + if (g_host_cuda_lock_arrays.atomic != nullptr) return; + CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic, + sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1))); + CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch, + sizeof(int)*(Cuda::concurrency()))); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + 
g_host_cuda_lock_arrays.n = Cuda::concurrency(); + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); + init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+1+255)/256,256>>>(); + init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency()); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); +} + +void finalize_host_cuda_lock_arrays() { + if (g_host_cuda_lock_arrays.atomic == nullptr) return; + cudaFree(g_host_cuda_lock_arrays.atomic); + g_host_cuda_lock_arrays.atomic = nullptr; + cudaFree(g_host_cuda_lock_arrays.scratch); + g_host_cuda_lock_arrays.scratch = nullptr; + g_host_cuda_lock_arrays.n = 0; +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); +#endif +} + +} // namespace Impl + +} // namespace Kokkos + +#else + +void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {} + +#endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d01f06fb4fcd41845e1a09ec4270440bf164af02 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp @@ -0,0 +1,166 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_LOCKS_HPP +#define KOKKOS_CUDA_LOCKS_HPP + +#include <Kokkos_Macros.hpp> + +#ifdef KOKKOS_ENABLE_CUDA + +#include <cstdint> + +#include <Cuda/Kokkos_Cuda_Error.hpp> + +namespace Kokkos { +namespace Impl { + +struct CudaLockArrays { + std::int32_t* atomic; + std::int32_t* scratch; + std::int32_t n; +}; + +/// \brief This global variable in Host space is the central definition +/// of these arrays. +extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays ; + +/// \brief After this call, the g_host_cuda_lock_arrays variable has +/// valid, initialized arrays. +/// +/// This call is idempotent. +void initialize_host_cuda_lock_arrays(); + +/// \brief After this call, the g_host_cuda_lock_arrays variable has +/// all null pointers, and all array memory has been freed. +/// +/// This call is idempotent. 
+void finalize_host_cuda_lock_arrays(); + +} // namespace Impl +} // namespace Kokkos + +#if defined( __CUDACC__ ) + +namespace Kokkos { +namespace Impl { + +/// \brief This global variable in CUDA space is what kernels use +/// to get access to the lock arrays. +/// +/// When relocatable device code is enabled, there can be one single +/// instance of this global variable for the entire executable, +/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration +/// here must then be extern. +/// This one instance will be initialized by initialize_host_cuda_lock_arrays +/// and need not be modified afterwards. +/// +/// When relocatable device code is disabled, an instance of this variable +/// will be created in every translation unit that sees this header file +/// (we make this clear by marking it static, meaning no other translation +/// unit can link to it). +/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the +/// instances in other translation units, we must update this CUDA global +/// variable based on the Host global variable prior to running any kernels +/// that will use it. +/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro. +__device__ __constant__ +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE +extern +#endif +Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays ; + +#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF + +/// \brief Aquire a lock for the address +/// +/// This function tries to aquire the lock for the hash value derived +/// from the provided ptr. If the lock is successfully aquired the +/// function returns true. Otherwise it returns false. 
+__device__ inline +bool lock_address_cuda_space(void* ptr) { + size_t offset = size_t(ptr); + offset = offset >> 2; + offset = offset & CUDA_SPACE_ATOMIC_MASK; + return (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset],0,1)); +} + +/// \brief Release lock for the address +/// +/// This function releases the lock for the hash value derived +/// from the provided ptr. This function should only be called +/// after previously successfully aquiring a lock with +/// lock_address. +__device__ inline +void unlock_address_cuda_space(void* ptr) { + size_t offset = size_t(ptr); + offset = offset >> 2; + offset = offset & CUDA_SPACE_ATOMIC_MASK; + atomicExch( &Kokkos::Impl::g_device_cuda_lock_arrays.atomic[ offset ], 0); +} + +} // namespace Impl +} // namespace Kokkos + +/* Dan Ibanez: it is critical that this code be a macro, so that it will + capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays! + putting this in an inline function will NOT do the right thing! 
*/ +#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ +{ \ + CUDA_SAFE_CALL(cudaMemcpyToSymbol( \ + Kokkos::Impl::g_device_cuda_lock_arrays , \ + & Kokkos::Impl::g_host_cuda_lock_arrays , \ + sizeof(Kokkos::Impl::CudaLockArrays) ) ); \ +} + +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() +#else +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() +#endif + +#endif /* defined( __CUDACC__ ) */ + +#endif /* defined( KOKKOS_ENABLE_CUDA ) */ + +#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index 0c8c700e8f1e2a2a33789ec56ef5d5ac80f4496c..e2eab19e451c90c4235017989bbad490aebd662a 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -58,6 +58,7 @@ #include <Cuda/Kokkos_CudaExec.hpp> #include <Cuda/Kokkos_Cuda_ReduceScan.hpp> #include <Cuda/Kokkos_Cuda_Internal.hpp> +#include <Cuda/Kokkos_Cuda_Locks.hpp> #include <Kokkos_Vectorization.hpp> #if defined(KOKKOS_ENABLE_PROFILING) @@ -65,6 +66,8 @@ #include <typeinfo> #endif +#include <KokkosExp_MDRangePolicy.hpp> + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -318,6 +321,7 @@ private: typedef Kokkos::RangePolicy< Traits ... 
> Policy; typedef typename Policy::member_type Member ; typedef typename Policy::work_tag WorkTag ; + typedef typename Policy::launch_bounds LaunchBounds ; const FunctorType m_functor ; const Policy m_policy ; @@ -363,7 +367,7 @@ public: const dim3 block( 1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1); const dim3 grid( std::min( ( nwork + block.y - 1 ) / block.y , cuda_internal_maximum_grid_count() ) , 1 , 1); - CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 ); + CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 ); } ParallelFor( const FunctorType & arg_functor , @@ -373,6 +377,115 @@ public: { } }; + +// MDRangePolicy impl +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::Cuda + > +{ +private: + typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ; + using RP = Policy; + typedef typename Policy::array_index_type array_index_type; + typedef typename Policy::index_type index_type; + typedef typename Policy::launch_bounds LaunchBounds; + + + const FunctorType m_functor ; + const Policy m_rp ; + +public: + + inline + __device__ + void operator()(void) const + { + Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range(); + } + + + inline + void execute() const + { + const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount); + if ( RP::rank == 2 ) + { + const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1); + const dim3 grid( + std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks ) + , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks ) + , 1 + ); + CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 ); + } + else if ( RP::rank == 3 ) + { + const dim3 block( m_rp.m_tile[0] , 
m_rp.m_tile[1] , m_rp.m_tile[2] ); + const dim3 grid( + std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks ) + , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks ) + , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks ) + ); + CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 ); + } + else if ( RP::rank == 4 ) + { + // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z + const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] ); + const dim3 grid( + std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] ) + , static_cast<index_type>(maxblocks) ) + , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks ) + , std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks ) + ); + CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 ); + } + else if ( RP::rank == 5 ) + { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z + const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] ); + const dim3 grid( + std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] ) + , static_cast<index_type>(maxblocks) ) + , std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] ) + , static_cast<index_type>(maxblocks) ) + , std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks ) + ); + CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 ); + } + else if ( RP::rank == 6 ) + { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z + const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] ); + const dim3 grid( + std::min( static_cast<index_type>( m_rp.m_tile_end[0] * 
m_rp.m_tile_end[1] ) + , static_cast<index_type>(maxblocks) ) + , std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] ) + , static_cast<index_type>(maxblocks) ) + , std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] ) + , static_cast<index_type>(maxblocks) ) + ); + CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 ); + } + else + { + printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); + Kokkos::abort("Aborting"); + } + + } //end execute + +// inline + ParallelFor( const FunctorType & arg_functor + , Policy arg_policy ) + : m_functor( arg_functor ) + , m_rp( arg_policy ) + {} +}; + + template< class FunctorType , class ... Properties > class ParallelFor< FunctorType , Kokkos::TeamPolicy< Properties ... > @@ -384,6 +497,7 @@ private: typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... > Policy ; typedef typename Policy::member_type Member ; typedef typename Policy::work_tag WorkTag ; + typedef typename Policy::launch_bounds LaunchBounds ; public: @@ -430,15 +544,15 @@ public: if ( m_scratch_size[1]>0 ) { __shared__ int base_thread_id; if (threadIdx.x==0 && threadIdx.y==0 ) { - threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n; + threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n; threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y; - if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y; + if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y; int done = 0; while (!done) { - done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1)); + done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1)); if(!done) { threadid += blockDim.x * blockDim.y; - if(threadid > kokkos_impl_cuda_lock_arrays.n) 
threadid = 0; + if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0; } } base_thread_id = threadid; @@ -448,7 +562,8 @@ public: } - for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) { + const int int_league_size = (int)m_league_size; + for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) { this-> template exec_team< WorkTag >( typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>() @@ -462,7 +577,7 @@ public: if ( m_scratch_size[1]>0 ) { __syncthreads(); if (threadIdx.x==0 && threadIdx.y==0 ) - kokkos_impl_cuda_lock_arrays.atomic[threadid]=0; + Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0; } } @@ -473,7 +588,7 @@ public: const dim3 grid( int(m_league_size) , 1 , 1 ); const dim3 block( int(m_vector_size) , int(m_team_size) , 1 ); - CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute + CudaParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute } @@ -529,6 +644,7 @@ private: typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::work_tag WorkTag ; typedef typename Policy::member_type Member ; + typedef typename Policy::launch_bounds LaunchBounds ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; @@ -563,6 +679,7 @@ private: typedef int DummySHMEMReductionType; public: + // Make the exec_range calls call to Reduce::DeviceIterateTile template< class TagType > __device__ inline typename std::enable_if< std::is_same< TagType , void >::value >::type @@ -686,7 +803,7 @@ public: const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y ); - CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to 
device and execute + CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute Cuda::fence(); @@ -737,6 +854,232 @@ public: { } }; + +// MDRangePolicy impl +template< class FunctorType , class ReducerType, class ... Traits > +class ParallelReduce< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , ReducerType + , Kokkos::Cuda + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ; + typedef typename Policy::array_index_type array_index_type; + typedef typename Policy::index_type index_type; + + typedef typename Policy::work_tag WorkTag ; + typedef typename Policy::member_type Member ; + typedef typename Policy::launch_bounds LaunchBounds; + + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; + +public: + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::value_type value_type ; + typedef typename ValueTraits::reference_type reference_type ; + typedef FunctorType functor_type ; + typedef Cuda::size_type size_type ; + + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1 + + const FunctorType m_functor ; + const Policy m_policy ; // used for workrange and nwork + const ReducerType m_reducer ; + const pointer_type m_result_ptr ; + size_type * m_scratch_space ; + size_type * m_scratch_flags ; + size_type * m_unified_space ; + + typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern; + + // Shall we use the 
shfl based reduction or not (only use it for static sized types of more than 128bit + enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) }; + // Some crutch to do function overloading +private: + typedef double DummyShflReductionType; + typedef int DummySHMEMReductionType; + +public: + inline + __device__ + void + exec_range( reference_type update ) const + { + Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range(); + } + + inline + __device__ + void operator() (void) const { + run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) ); + } + + __device__ inline + void run(const DummySHMEMReductionType& ) const + { + const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) > + word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) ); + + { + reference_type value = + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value ); + + // Number of blocks is bounded so that the reduction can be limited to two passes. + // Each thread block is given an approximately equal amount of work to perform. + // Accumulate the values for this block. + // The accumulation ordering does not match the final pass, but is arithmatically equivalent. + + this-> exec_range( value ); + } + + // Reduce with final value at blockDim.y - 1 location. 
+ // Problem: non power-of-two blockDim + if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>( + ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x , + kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) { + + // This is the final block with the final result at the final threads' location + size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ; + size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ; + + if ( threadIdx.y == 0 ) { + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared ); + } + + if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } + + for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; } + } + } + + __device__ inline + void run(const DummyShflReductionType&) const + { + + value_type value; + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value); + // Number of blocks is bounded so that the reduction can be limited to two passes. + // Each thread block is given an approximately equal amount of work to perform. + // Accumulate the values for this block. + // The accumulation ordering does not match the final pass, but is arithmatically equivalent. + + const Member work_part = + ( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion of tiles handled by each block + + this-> exec_range( value ); + + pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ; + + int max_active_thread = work_part < blockDim.y ? 
work_part:blockDim.y; + max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread; + + value_type init; + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init); + if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag> + (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) { + const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; + if(id==0) { + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); + *result = value; + } + } + } + + // Determine block size constrained by shared memory: + static inline + unsigned local_block_size( const FunctorType & f ) + { + unsigned n = CudaTraits::WarpSize * 8 ; + while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; } + return n ; + } + + inline + void execute() + { + const int nwork = m_policy.m_num_tiles; + if ( nwork ) { + int block_size = m_policy.m_prod_tile_dims; + // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions + // Nearest power of two + int exponent_pow_two = std::ceil( std::log2(block_size) ); + block_size = std::pow(2, exponent_pow_two); + int suggested_blocksize = local_block_size( m_functor ); + + block_size = (block_size > suggested_blocksize) ? 
block_size : suggested_blocksize ; //Note: block_size must be less than or equal to 512 + + + m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ ); + m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); + m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) ); + + // REQUIRED ( 1 , N , 1 ) + const dim3 block( 1 , block_size , 1 ); + // Required grid.x <= block.y + const dim3 grid( std::min( int(block.y) , int( nwork ) ) , 1 , 1 ); + + const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y ); + + CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute + + Cuda::fence(); + + if ( m_result_ptr ) { + if ( m_unified_space ) { + const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); + for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; } + } + else { + const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ); + DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size ); + } + } + } + else { + if (m_result_ptr) { + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr ); + } + } + } + + template< class HostViewType > + ParallelReduce( const FunctorType & arg_functor + , const Policy & arg_policy + , const HostViewType & arg_result + , typename std::enable_if< + Kokkos::is_view< HostViewType >::value + ,void*>::type = NULL) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_result.ptr_on_device() ) + , m_scratch_space( 0 ) + , m_scratch_flags( 0 ) + , m_unified_space( 0 ) + {} + + ParallelReduce( const FunctorType & arg_functor + 
, const Policy & arg_policy + , const ReducerType & reducer) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + , m_reducer( reducer ) + , m_result_ptr( reducer.view().ptr_on_device() ) + , m_scratch_space( 0 ) + , m_scratch_flags( 0 ) + , m_unified_space( 0 ) + {} +}; + + //---------------------------------------------------------------------------- #if 1 @@ -753,6 +1096,7 @@ private: typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... > Policy ; typedef typename Policy::member_type Member ; typedef typename Policy::work_tag WorkTag ; + typedef typename Policy::launch_bounds LaunchBounds ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; @@ -819,15 +1163,15 @@ public: if ( m_scratch_size[1]>0 ) { __shared__ int base_thread_id; if (threadIdx.x==0 && threadIdx.y==0 ) { - threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n; + threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n; threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y; - if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y; + if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y; int done = 0; while (!done) { - done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1)); + done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1)); if(!done) { threadid += blockDim.x * blockDim.y; - if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0; + if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0; } } base_thread_id = threadid; @@ -840,7 +1184,7 @@ public: if ( m_scratch_size[1]>0 ) { __syncthreads(); if (threadIdx.x==0 && threadIdx.y==0 ) - 
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0; + Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0; } } @@ -854,7 +1198,8 @@ public: ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value ); // Iterate this block through the league - for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) { + const int int_league_size = (int)m_league_size; + for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) { this-> template exec_team< WorkTag > ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin , m_shmem_begin @@ -894,7 +1239,8 @@ public: ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value); // Iterate this block through the league - for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) { + const int int_league_size = (int)m_league_size; + for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) { this-> template exec_team< WorkTag > ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin , m_shmem_begin @@ -936,7 +1282,7 @@ public: const dim3 grid( block_count , 1 , 1 ); const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ; - CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute + CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute Cuda::fence(); @@ -975,12 +1321,6 @@ public: , m_shmem_begin( 0 ) , m_shmem_size( 0 ) , m_scratch_ptr{NULL,NULL} - , m_league_size( arg_policy.league_size() ) - , m_team_size( 0 <= arg_policy.team_size() ? 
arg_policy.team_size() : - Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(), - arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / - arg_policy.vector_length() ) - , m_vector_size( arg_policy.vector_length() ) , m_scratch_size{ arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() : Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(), @@ -991,6 +1331,12 @@ public: arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() ) )} + , m_league_size( arg_policy.league_size() ) + , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() : + Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(), + arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / + arg_policy.vector_length() ) + , m_vector_size( arg_policy.vector_length() ) { // Return Init value if the number of worksets is zero if( arg_policy.league_size() == 0) { @@ -1150,6 +1496,7 @@ private: typedef typename reducer_type<>::pointer_type pointer_type ; typedef typename reducer_type<>::reference_type reference_type ; typedef typename reducer_type<>::value_type value_type ; + typedef typename Policy::launch_bounds LaunchBounds ; typedef Kokkos::Impl::FunctorAnalysis < Kokkos::Impl::FunctorPatternInterface::REDUCE @@ -1273,7 +1620,7 @@ public: const int shmem = m_shmem_team_begin + m_shmem_team_size ; // copy to device and execute - CudaParallelLaunch<ParallelReduce>( *this, grid, block, shmem ); + CudaParallelLaunch<ParallelReduce,LaunchBounds>( *this, grid, block, shmem ); Cuda::fence(); @@ -1373,7 +1720,7 @@ public: if ( CudaTraits::WarpSize < team_threads ) { // Need inter-warp team reduction (collectives) shared memory - // Speculate an upper bound for the value size + // Speculate an upper bound for the value size m_shmem_team_begin = align_scratch( 
CudaTraits::warp_count(team_threads) * sizeof(double) ); @@ -1426,7 +1773,7 @@ public: // Reduce space has claim flag followed by vaue buffer const int global_reduce_value_size = - max_concurrent_block * + max_concurrent_block * ( aligned_flag_size + align_scratch( value_size ) ); // Scratch space has claim flag followed by scratch buffer @@ -1469,6 +1816,7 @@ private: typedef typename Policy::member_type Member ; typedef typename Policy::work_tag WorkTag ; typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::launch_bounds LaunchBounds ; typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; @@ -1655,10 +2003,10 @@ public: const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 ); m_final = false ; - CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute + CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute m_final = true ; - CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute + CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute } } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 432c7895ccf88e6fbf14172c0491e83959c0d3a0..709cbbd534c9c6df2895a5591efc4f25b311139f 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -151,7 +151,7 @@ template< class ValueType , class JoinOp> __device__ inline void cuda_intra_warp_reduction( ValueType& result, const JoinOp& join, - const int max_active_thread = blockDim.y) { + const uint32_t max_active_thread = blockDim.y) { unsigned int shift = 1; @@ -268,29 +268,33 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT if( id + 1 
< int(gridDim.x) ) join(value, tmp); } + int active = __ballot(1); if (int(blockDim.x*blockDim.y) > 2) { value_type tmp = Kokkos::shfl_down(value, 2,32); if( id + 2 < int(gridDim.x) ) join(value, tmp); } + active += __ballot(1); if (int(blockDim.x*blockDim.y) > 4) { value_type tmp = Kokkos::shfl_down(value, 4,32); if( id + 4 < int(gridDim.x) ) join(value, tmp); } + active += __ballot(1); if (int(blockDim.x*blockDim.y) > 8) { value_type tmp = Kokkos::shfl_down(value, 8,32); if( id + 8 < int(gridDim.x) ) join(value, tmp); } + active += __ballot(1); if (int(blockDim.x*blockDim.y) > 16) { value_type tmp = Kokkos::shfl_down(value, 16,32); if( id + 16 < int(gridDim.x) ) join(value, tmp); } + active += __ballot(1); } } - //The last block has in its thread=0 the global reduction value through "value" return last_block; #else @@ -302,7 +306,7 @@ template< class ReducerType > __device__ inline typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type cuda_intra_warp_reduction( const ReducerType& reducer, - const int max_active_thread = blockDim.y) { + const uint32_t max_active_thread = blockDim.y) { typedef typename ReducerType::value_type ValueType; @@ -428,26 +432,31 @@ cuda_inter_block_reduction( const ReducerType& reducer, if( id + 1 < int(gridDim.x) ) reducer.join(value, tmp); } + int active = __ballot(1); if (int(blockDim.x*blockDim.y) > 2) { value_type tmp = Kokkos::shfl_down(value, 2,32); if( id + 2 < int(gridDim.x) ) reducer.join(value, tmp); } + active += __ballot(1); if (int(blockDim.x*blockDim.y) > 4) { value_type tmp = Kokkos::shfl_down(value, 4,32); if( id + 4 < int(gridDim.x) ) reducer.join(value, tmp); } + active += __ballot(1); if (int(blockDim.x*blockDim.y) > 8) { value_type tmp = Kokkos::shfl_down(value, 8,32); if( id + 8 < int(gridDim.x) ) reducer.join(value, tmp); } + active += __ballot(1); if (int(blockDim.x*blockDim.y) > 16) { value_type tmp = Kokkos::shfl_down(value, 16,32); if( id + 16 < int(gridDim.x) ) reducer.join(value, tmp); } + 
active += __ballot(1); } } @@ -594,7 +603,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor , typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ; typedef typename ValueTraits::pointer_type pointer_type ; - typedef typename ValueTraits::reference_type reference_type ; + //typedef typename ValueTraits::reference_type reference_type ; // '__ffs' = position of the least significant bit set to 1. // 'blockDim.y' is guaranteed to be a power of two so this @@ -637,7 +646,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor , { void * const shared_ptr = shared_data + word_count.value * threadIdx.y ; - reference_type shared_value = ValueInit::init( functor , shared_ptr ); + /* reference_type shared_value = */ ValueInit::init( functor , shared_ptr ); for ( size_type i = b ; i < e ; ++i ) { ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i ); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp index 3c6f0a5ddaeb78f9252cbd3da7226f14774bf970..5f08800c40292efaeadaf74ae5b70e7a47974fb9 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp @@ -58,25 +58,56 @@ template class TaskQueue< Kokkos::Cuda > ; //---------------------------------------------------------------------------- +#if defined( KOKKOS_DEBUG ) + +__device__ +void verify_warp_convergence( const char * const where ) +{ + const unsigned b = __ballot(1); + + if ( b != ~0u ) { + +printf(" verify_warp_convergence( %s ) (%d,%d,%d) (%d,%d,%d) failed %x\n" + , where + , blockIdx.x + , blockIdx.y + , blockIdx.z + , threadIdx.x + , threadIdx.y + , threadIdx.z + , b ); + + } +} + +#endif // #if defined( KOKKOS_DEBUG ) + +//---------------------------------------------------------------------------- + __device__ void TaskQueueSpecialization< Kokkos::Cuda >::driver - ( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue ) + ( 
TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue + , int32_t shmem_per_warp ) { using Member = TaskExec< Kokkos::Cuda > ; using Queue = TaskQueue< Kokkos::Cuda > ; - using task_root_type = TaskBase< Kokkos::Cuda , void , void > ; + using task_root_type = TaskBase< void , void , void > ; + + extern __shared__ int32_t shmem_all[]; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; - Member single_exec( 1 ); - Member team_exec( blockDim.y ); + int32_t * const warp_shmem = + shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t); + + task_root_type * const task_shmem = (task_root_type *) warp_shmem ; const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ; - union { - task_root_type * ptr ; - int raw[2] ; - } task ; + Member single_exec( warp_shmem , 1 ); + Member team_exec( warp_shmem , blockDim.y ); + + task_root_type * task_ptr ; // Loop until all queues are empty and no tasks in flight @@ -87,41 +118,86 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver if ( 0 == warp_lane ) { - task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ; + task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? 
end : 0 ; // Loop by priority and then type - for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) { - for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) { - task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] ); + for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) { + for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) { + task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] ); } } #if 0 printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x - , uintptr_t(task.ptr)); + , uintptr_t(task_ptr)); #endif } // shuffle broadcast - task.raw[0] = __shfl( task.raw[0] , 0 ); - task.raw[1] = __shfl( task.raw[1] , 0 ); + ((int*) & task_ptr )[0] = __shfl( ((int*) & task_ptr )[0] , 0 ); + ((int*) & task_ptr )[1] = __shfl( ((int*) & task_ptr )[1] , 0 ); + +#if defined( KOKKOS_DEBUG ) + verify_warp_convergence("task_ptr"); +#endif + + if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count + + if ( end != task_ptr ) { - if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count + // Whole warp copy task's closure to/from shared memory. + // Use all threads of warp for coalesced read/write. - if ( end != task.ptr ) { - if ( task_root_type::TaskTeam == task.ptr->m_task_type ) { + int32_t const b = sizeof(task_root_type) / sizeof(int32_t); + int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t); + + int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ; + + // copy global to shared memory: + + for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) { + warp_shmem[i] = task_mem[i] ; + } + + Kokkos::memory_fence(); + + // Copy done - use memory fence so that memory writes are visible. + // For reliable warp convergence on Pascal and Volta an explicit + // warp level synchronization will also be required. 
+ + if ( task_root_type::TaskTeam == task_shmem->m_task_type ) { // Thread Team Task - (*task.ptr->m_apply)( task.ptr , & team_exec ); + (*task_shmem->m_apply)( task_shmem , & team_exec ); } else if ( 0 == threadIdx.y ) { // Single Thread Task - (*task.ptr->m_apply)( task.ptr , & single_exec ); + (*task_shmem->m_apply)( task_shmem , & single_exec ); } + // copy shared to global memory: + + for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) { + task_mem[i] = warp_shmem[i] ; + } + + Kokkos::memory_fence(); + +#if defined( KOKKOS_DEBUG ) + verify_warp_convergence("apply"); +#endif + + // If respawn requested copy respawn data back to main memory + if ( 0 == warp_lane ) { - queue->complete( task.ptr ); + + if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) { + ( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ; + ( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ; + } + + queue->complete( task_ptr ); } } } while(1); @@ -130,18 +206,20 @@ printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x namespace { __global__ -void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue ) -{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); } +void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue + , int32_t shmem_size ) +{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); } } void TaskQueueSpecialization< Kokkos::Cuda >::execute ( TaskQueue< Kokkos::Cuda > * const queue ) { + const int shared_per_warp = 2048 ; const int warps_per_block = 4 ; const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 ); const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block ); - const int shared = 0 ; + const int shared_total = shared_per_warp * warps_per_block ; const cudaStream_t stream = 0 ; CUDA_SAFE_CALL( cudaDeviceSynchronize() ); @@ -159,7 +237,7 @@ printf("cuda_task_queue_execute before\n"); // // 
CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) ); - cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue ); + cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp ); CUDA_SAFE_CALL( cudaGetLastError() ); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 5d08219ea5bf9767a49de4a602e7625b49eeb069..4a52985d29422a4490ae9271338fd04db2b1cda6 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -57,7 +57,7 @@ namespace { template< typename TaskType > __global__ void set_cuda_task_base_apply_function_pointer - ( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr ) + ( TaskBase<void,void,void>::function_type * ptr ) { *ptr = TaskType::apply ; } } @@ -78,7 +78,7 @@ public: void iff_single_thread_recursive_execute( queue_type * const ) {} __device__ - static void driver( queue_type * const ); + static void driver( queue_type * const , int32_t ); static void execute( queue_type * const ); @@ -106,7 +106,14 @@ public: extern template class TaskQueue< Kokkos::Cuda > ; +}} /* namespace Kokkos::Impl */ + //---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + /**\brief Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type * passed to tasks running in a Cuda space. 
* @@ -134,11 +141,13 @@ private: friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ; friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ; + int32_t * m_team_shmem ; const int m_team_size ; __device__ - TaskExec( int arg_team_size = blockDim.y ) - : m_team_size( arg_team_size ) {} + TaskExec( int32_t * arg_team_shmem , int arg_team_size = blockDim.y ) + : m_team_shmem( arg_team_shmem ) + , m_team_size( arg_team_size ) {} public: @@ -154,8 +163,14 @@ public: }; +}} /* namespace Kokkos::Impl */ + +//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +namespace Kokkos { +namespace Impl { + template<typename iType> struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > > { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index 084daa098b07662c475d4e28a7f3e3b763a227c4..3f3d85ecd1386329b2ab1ae7c98a773ca9a63f50 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -106,7 +106,7 @@ private: typedef Kokkos::Cuda execution_space ; typedef execution_space::scratch_memory_space scratch_memory_space ; - void * m_team_reduce ; + mutable void * m_team_reduce ; scratch_memory_space m_team_shared ; int m_team_reduce_size ; int m_league_rank ; @@ -166,7 +166,7 @@ public: if ( 1 == blockDim.z ) { // team == block __syncthreads(); // Wait for shared data write until all threads arrive here - if ( threadIdx.x == 0 && threadIdx.y == thread_id ) { + if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) { *((ValueType*) m_team_reduce) = val ; } __syncthreads(); // Wait for shared data read until root thread writes @@ -210,7 +210,7 @@ public: const int wx = ( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ; - for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) { + for ( int i = 
CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) { cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize ); @@ -354,7 +354,7 @@ public: for ( int i = blockDim.x ; ( i >>= 1 ) ; ) { cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x ); - if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); } + if ( (int)threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); } } // Broadcast from root lane to all other lanes. @@ -410,7 +410,7 @@ public: value_type tmp( reducer.reference() ); - for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) { + for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) { cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize ); @@ -479,7 +479,7 @@ public: __threadfence(); // Wait until global write is visible. - last_block = gridDim.x == + last_block = (int)gridDim.x == 1 + Kokkos::atomic_fetch_add(global_scratch_flags,1); // If last block then reset count @@ -509,7 +509,7 @@ public: reducer.copy( ((pointer_type)shmem) + offset , ((pointer_type)global_scratch_space) + offset ); - for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) { + for ( int i = nentry + tid ; i < (int)gridDim.x ; i += nentry ) { reducer.join( ((pointer_type)shmem) + offset , ((pointer_type)global_scratch_space) + i * reducer.length() ); @@ -576,6 +576,14 @@ public: , m_league_size( arg_league_size ) {} +public: + // Declare to avoid unused private member warnings which are trigger + // when SFINAE excludes the member function which uses these variables + // Making another class a friend also surpresses these warnings + bool impl_avoid_sfinae_warning() const noexcept + { + return m_team_reduce_size > 0 && m_team_reduce != nullptr; + } }; } // namspace Impl @@ -913,10 +921,10 @@ void parallel_scan // [t] += [t-4] if t >= 4 // ... 
- for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) { + for ( int j = 1 ; j < (int)blockDim.x ; j <<= 1 ) { value_type tmp = 0 ; Impl::cuda_shfl_up( tmp , sval , j , blockDim.x ); - if ( j <= threadIdx.x ) { sval += tmp ; } + if ( j <= (int)threadIdx.x ) { sval += tmp ; } } // Include accumulation and remove value for exclusive scan: diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e11ae4798fbc945fbf3f3eb84eb745d0366af6a0 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -0,0 +1,133 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_UNIQUE_TOKEN_HPP +#define KOKKOS_CUDA_UNIQUE_TOKEN_HPP + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_CUDA + +#include <Kokkos_CudaSpace.hpp> +#include <Kokkos_UniqueToken.hpp> +#include <impl/Kokkos_SharedAlloc.hpp> +#include <impl/Kokkos_ConcurrentBitset.hpp> + +namespace Kokkos { namespace Experimental { + +// both global and instance Unique Tokens are implemented in the same way +template<> +class UniqueToken< Cuda, UniqueTokenScope::Global > +{ +private: + + uint32_t volatile * m_buffer ; + uint32_t m_count ; + +public: + + using execution_space = Cuda; + + explicit + UniqueToken( execution_space const& ); + + KOKKOS_INLINE_FUNCTION + UniqueToken() : m_buffer(0), m_count(0) {} + + KOKKOS_INLINE_FUNCTION + UniqueToken( const UniqueToken & ) = default; + + KOKKOS_INLINE_FUNCTION + UniqueToken( UniqueToken && ) = default; + + KOKKOS_INLINE_FUNCTION + UniqueToken & operator=( const UniqueToken & ) = default ; + + KOKKOS_INLINE_FUNCTION + UniqueToken & operator=( UniqueToken && ) = default ; + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + KOKKOS_INLINE_FUNCTION + int32_t size() const noexcept { return m_count ; } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int32_t acquire() const + { + const Kokkos::pair<int,int> result = + Kokkos::Impl::concurrent_bitset:: + acquire_bounded( m_buffer + , m_count + , Kokkos::Impl::clock_tic() % m_count + ); + + if ( result.first < 0 ) { + Kokkos::abort("UniqueToken<Cuda> failure to release tokens, no tokens available" ); + } + + return result.first; + } + + /// \brief release an acquired value + KOKKOS_INLINE_FUNCTION + void release( int32_t i ) const noexcept + { + Kokkos::Impl::concurrent_bitset::release( m_buffer, i ); + } +}; + +template<> +class UniqueToken< Cuda, UniqueTokenScope::Instance > + : public UniqueToken< Cuda, UniqueTokenScope::Global > +{ +public: + + explicit + UniqueToken( execution_space const& arg ) + : UniqueToken< Cuda, UniqueTokenScope::Global >( arg ) {} +}; + +}} // namespace Kokkos::Experimental + +#endif // KOKKOS_ENABLE_CUDA +#endif // KOKKOS_CUDA_UNIQUE_TOKEN_HPP + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp index f5e2d87fb67982b0cfb7fcd9b7e6be254b9f790d..d641622bb6259abfd64602eb1e93436fc55f5772 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -221,7 +221,6 @@ struct CudaLDGFetch { //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { /** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization @@ -294,9 +293,8 @@ public: } }; -} -} -} +} // namespace Impl +} // namespace Kokkos //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..99778c64b1db0726acad3381cc1572c2899ef2b4 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -0,0 +1,119 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP +#define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType , + Kokkos::Experimental::WorkGraphPolicy< Traits ... > , + Kokkos::Cuda + > + : public Kokkos::Impl::Experimental:: + WorkGraphExec< FunctorType, + Kokkos::Cuda, + Traits ... + > +{ +public: + + typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; + typedef Kokkos::Impl::Experimental:: + WorkGraphExec<FunctorType, Kokkos::Cuda, Traits ... > Base ; + typedef ParallelFor<FunctorType, Policy, Kokkos::Cuda> Self ; + +private: + + template< class TagType > + __device__ + typename std::enable_if< std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + Base::m_functor( i ); + } + + template< class TagType > + __device__ + typename std::enable_if< ! 
std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + const TagType t{} ; + Base::m_functor( t , i ); + } + +public: + + __device__ + inline + void operator()() const { + for (std::int32_t i; (-1 != (i = Base::before_work())); ) { + exec_one< typename Policy::work_tag >( i ); + Base::after_work(i); + } + } + + inline + void execute() + { + const int warps_per_block = 4 ; + const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 ); + const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block ); + const int shared = 0 ; + const cudaStream_t stream = 0 ; + + Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, stream); + } + + inline + ParallelFor( const FunctorType & arg_functor + , const Policy & arg_policy ) + : Base( arg_functor, arg_policy ) + { + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 4f68d9c2c091a9355fae1a29fd7fc4567fc2eb2b..6ef7443a149f4667b407e39f54b9e0b68055f86a 100644 --- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -52,6 +52,7 @@ #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA ) #include<Cuda/KokkosExp_Cuda_IterateTile.hpp> +#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp> #endif namespace Kokkos { namespace Experimental { @@ -120,13 +121,53 @@ struct MDRangePolicy , typename traits::index_type > ; + typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation + static_assert( !std::is_same<typename traits::iteration_pattern,void>::value , "Kokkos Error: MD iteration pattern not defined" ); using iteration_pattern = typename traits::iteration_pattern; using work_tag = typename traits::work_tag; + using launch_bounds = typename traits::launch_bounds; + using member_type = 
typename range_policy::member_type; + + enum { rank = static_cast<int>(iteration_pattern::rank) }; + + using index_type = typename traits::index_type; + using array_index_type = long; + using point_type = Kokkos::Array<array_index_type,rank>; //was index_type + using tile_type = Kokkos::Array<array_index_type,rank>; + // If point_type or tile_type is not templated on a signed integral type (if it is unsigned), + // then if user passes in intializer_list of runtime-determined values of + // signed integral type that are not const will receive a compiler error due + // to an invalid case for implicit conversion - + // "conversion from integer or unscoped enumeration type to integer type that cannot represent all values of the original, except where source is a constant expression whose value can be stored exactly in the target type" + // This would require the user to either pass a matching index_type parameter + // as template parameter to the MDRangePolicy or static_cast the individual values + + point_type m_lower; + point_type m_upper; + tile_type m_tile; + point_type m_tile_end; + index_type m_num_tiles; + index_type m_prod_tile_dims; + +/* + // NDE enum impl definition alternative - replace static constexpr int ? + enum { outer_direction = static_cast<int> ( + (iteration_pattern::outer_direction != Iterate::Default) + ? iteration_pattern::outer_direction + : default_outer_direction< typename traits::execution_space>::value ) }; + + enum { inner_direction = static_cast<int> ( + iteration_pattern::inner_direction != Iterate::Default + ? 
iteration_pattern::inner_direction + : default_inner_direction< typename traits::execution_space>::value ) }; - static constexpr int rank = iteration_pattern::rank; + enum { Right = static_cast<int>( Iterate::Right ) }; + enum { Left = static_cast<int>( Iterate::Left ) }; +*/ + //static constexpr int rank = iteration_pattern::rank; static constexpr int outer_direction = static_cast<int> ( (iteration_pattern::outer_direction != Iterate::Default) @@ -138,28 +179,16 @@ struct MDRangePolicy ? iteration_pattern::inner_direction : default_inner_direction< typename traits::execution_space>::value ) ; - // Ugly ugly workaround intel 14 not handling scoped enum correctly static constexpr int Right = static_cast<int>( Iterate::Right ); static constexpr int Left = static_cast<int>( Iterate::Left ); - using index_type = typename traits::index_type; - using array_index_type = long; - using point_type = Kokkos::Array<array_index_type,rank>; //was index_type - using tile_type = Kokkos::Array<array_index_type,rank>; - // If point_type or tile_type is not templated on a signed integral type (if it is unsigned), - // then if user passes in intializer_list of runtime-determined values of - // signed integral type that are not const will receive a compiler error due - // to an invalid case for implicit conversion - - // "conversion from integer or unscoped enumeration type to integer type that cannot represent all values of the original, except where source is a constant expression whose value can be stored exactly in the target type" - // This would require the user to either pass a matching index_type parameter - // as template parameter to the MDRangePolicy or static_cast the individual values - MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} ) : m_lower(lower) , m_upper(upper) , m_tile(tile) , m_num_tiles(1) + , m_prod_tile_dims(1) { // Host if ( true @@ -172,8 +201,8 @@ struct MDRangePolicy for (int i=0; i<rank; ++i) { span = 
upper[i] - lower[i]; if ( m_tile[i] <= 0 ) { - if ( (inner_direction == Right && (i < rank-1)) - || (inner_direction == Left && (i > 0)) ) + if ( ((int)inner_direction == (int)Right && (i < rank-1)) + || ((int)inner_direction == (int)Left && (i > 0)) ) { m_tile[i] = 2; } @@ -183,6 +212,7 @@ struct MDRangePolicy } m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]); m_num_tiles *= m_tile_end[i]; + m_prod_tile_dims *= m_tile[i]; } } #if defined(KOKKOS_ENABLE_CUDA) @@ -190,14 +220,18 @@ struct MDRangePolicy { index_type span; for (int i=0; i<rank; ++i) { - span = upper[i] - lower[i]; + span = m_upper[i] - m_lower[i]; if ( m_tile[i] <= 0 ) { // TODO: determine what is a good default tile size for cuda // may be rank dependent - if ( (inner_direction == Right && (i < rank-1)) - || (inner_direction == Left && (i > 0)) ) + if ( ((int)inner_direction == (int)Right && (i < rank-1)) + || ((int)inner_direction == (int)Left && (i > 0)) ) { - m_tile[i] = 2; + if ( m_prod_tile_dims < 512 ) { + m_tile[i] = 2; + } else { + m_tile[i] = 1; + } } else { m_tile[i] = 16; @@ -205,12 +239,9 @@ struct MDRangePolicy } m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]); m_num_tiles *= m_tile_end[i]; + m_prod_tile_dims *= m_tile[i]; } - index_type total_tile_size_check = 1; - for (int i=0; i<rank; ++i) { - total_tile_size_check *= m_tile[i]; - } - if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit + if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024 printf(" Tile dimensions exceed Cuda limits\n"); Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims"); //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of 
threads per block - choose smaller tile dims"); @@ -223,19 +254,7 @@ struct MDRangePolicy template < typename LT , typename UT , typename TT = array_index_type > MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} ) { -#if 0 - // This should work, less duplicated code but not yet extensively tested - point_type lower_tmp, upper_tmp; - tile_type tile_tmp; - for ( auto i = 0; i < rank; ++i ) { - lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]); - upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]); - tile_tmp[i] = static_cast<array_index_type>(tile.begin()[i]); - } - - MDRangePolicy( lower_tmp, upper_tmp, tile_tmp ); -#else if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank) Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size"); @@ -249,7 +268,7 @@ struct MDRangePolicy } m_num_tiles = 1; - + m_prod_tile_dims = 1; // Host if ( true @@ -262,8 +281,8 @@ struct MDRangePolicy for (int i=0; i<rank; ++i) { span = m_upper[i] - m_lower[i]; if ( m_tile[i] <= 0 ) { - if ( (inner_direction == Right && (i < rank-1)) - || (inner_direction == Left && (i > 0)) ) + if ( ((int)inner_direction == (int)Right && (i < rank-1)) + || ((int)inner_direction == (int)Left && (i > 0)) ) { m_tile[i] = 2; } @@ -273,6 +292,7 @@ struct MDRangePolicy } m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]); m_num_tiles *= m_tile_end[i]; + m_prod_tile_dims *= m_tile[i]; } } #if defined(KOKKOS_ENABLE_CUDA) @@ -284,10 +304,14 @@ struct MDRangePolicy if ( m_tile[i] <= 0 ) { // TODO: determine what is a good default tile size for cuda // may be rank dependent - if ( (inner_direction == Right && (i < rank-1)) - || (inner_direction == Left && (i > 0)) ) + if ( ((int)inner_direction == (int)Right && (i < rank-1)) + || ((int)inner_direction == (int)Left && (i > 0)) ) { - m_tile[i] = 2; + if ( m_prod_tile_dims < 512 ) { 
+ m_tile[i] = 2; + } else { + m_tile[i] = 1; + } } else { m_tile[i] = 16; @@ -295,32 +319,22 @@ struct MDRangePolicy } m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]); m_num_tiles *= m_tile_end[i]; + m_prod_tile_dims *= m_tile[i]; } - index_type total_tile_size_check = 1; - for (int i=0; i<rank; ++i) { - total_tile_size_check *= m_tile[i]; - } - if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit + if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024 printf(" Tile dimensions exceed Cuda limits\n"); Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims"); //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims"); } } #endif -#endif } - - point_type m_lower; - point_type m_upper; - tile_type m_tile; - point_type m_tile_end; - index_type m_num_tiles; }; // ------------------------------------------------------------------ // // ------------------------------------------------------------------ // -//md_parallel_for +//md_parallel_for - deprecated use parallel_for // ------------------------------------------------------------------ // template <typename MDRange, typename Functor, typename Enable = void> void md_parallel_for( MDRange const& range @@ -335,7 +349,6 @@ void md_parallel_for( MDRange const& range { Impl::MDFunctor<MDRange, Functor, void> g(range, f); - //using range_policy = typename MDRange::range_policy; using range_policy = typename MDRange::impl_range_policy; Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str ); @@ -354,7 +367,6 @@ void md_parallel_for( const std::string& str { Impl::MDFunctor<MDRange, Functor, void> 
g(range, f); - //using range_policy = typename MDRange::range_policy; using range_policy = typename MDRange::impl_range_policy; Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str ); @@ -395,7 +407,7 @@ void md_parallel_for( MDRange const& range // ------------------------------------------------------------------ // // ------------------------------------------------------------------ // -//md_parallel_reduce +//md_parallel_reduce - deprecated use parallel_reduce // ------------------------------------------------------------------ // template <typename MDRange, typename Functor, typename ValueType> void md_parallel_reduce( MDRange const& range @@ -409,9 +421,8 @@ void md_parallel_reduce( MDRange const& range ) >::type* = 0 ) { - Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v); + Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f); - //using range_policy = typename MDRange::range_policy; using range_policy = typename MDRange::impl_range_policy; Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v ); } @@ -428,48 +439,14 @@ void md_parallel_reduce( const std::string& str ) >::type* = 0 ) { - Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v); + Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f); - //using range_policy = typename MDRange::range_policy; using range_policy = typename MDRange::impl_range_policy; Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v ); } -// Cuda - parallel_reduce not implemented yet -/* -template <typename MDRange, typename Functor, typename ValueType> -void md_parallel_reduce( MDRange const& range - , Functor const& f - , ValueType & v - , const std::string& str = "" - , typename std::enable_if<( true - #if defined( KOKKOS_ENABLE_CUDA) - && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value - #endif - ) >::type* = 0 - ) -{ - Impl::DeviceIterateTile<MDRange, Functor, 
typename MDRange::work_tag> closure(range, f, v); - closure.execute(); -} - -template <typename MDRange, typename Functor, typename ValueType> -void md_parallel_reduce( const std::string& str - , MDRange const& range - , Functor const& f - , ValueType & v - , typename std::enable_if<( true - #if defined( KOKKOS_ENABLE_CUDA) - && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value - #endif - ) >::type* = 0 - ) -{ - Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v); - closure.execute(); -} -*/ +// Cuda - md_parallel_reduce not implemented - use parallel_reduce }} // namespace Kokkos::Experimental diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp index 3ecae24da4fc24b5f7980e72e57b5740b82136d8..3c8673c66a4783cf6dabcb6096568cc8cf8f0d7d 100644 --- a/lib/kokkos/core/src/Kokkos_Atomic.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp @@ -114,40 +114,9 @@ #endif /* Not pre-selected atomic implementation */ #endif -//---------------------------------------------------------------------------- - -// Forward decalaration of functions supporting arbitrary sized atomics -// This is necessary since Kokkos_Atomic.hpp is internally included very early -// through Kokkos_HostSpace.hpp as well as the allocation tracker. #ifdef KOKKOS_ENABLE_CUDA -namespace Kokkos { -namespace Impl { -/// \brief Aquire a lock for the address -/// -/// This function tries to aquire the lock for the hash value derived -/// from the provided ptr. If the lock is successfully aquired the -/// function returns true. Otherwise it returns false. -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -extern +#include <Cuda/Kokkos_Cuda_Locks.hpp> #endif -__device__ inline -bool lock_address_cuda_space(void* ptr); - -/// \brief Release lock for the address -/// -/// This function releases the lock for the hash value derived -/// from the provided ptr. 
This function should only be called -/// after previously successfully aquiring a lock with -/// lock_address. -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -extern -#endif -__device__ inline -void unlock_address_cuda_space(void* ptr); -} -} -#endif - namespace Kokkos { template <typename T> diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp index 9a2b53e1570664fd1a98b20c187b78790fb5c656..5480dbf40c225da44c15d475d34cdebb0e1fbc30 100644 --- a/lib/kokkos/core/src/Kokkos_Concepts.hpp +++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp @@ -79,6 +79,21 @@ struct IndexType using type = T; }; +/**\brief Specify Launch Bounds for CUDA execution. + * + * The "best" defaults may be architecture specific. + */ +template< unsigned int maxT = 1024 /* Max threads per block */ + , unsigned int minB = 1 /* Min blocks per SM */ + > +struct LaunchBounds +{ + using launch_bounds = LaunchBounds; + using type = LaunchBounds<maxT,minB>; + static unsigned int constexpr maxTperB {maxT}; + static unsigned int constexpr minBperSM {minB}; +}; + } // namespace Kokkos //---------------------------------------------------------------------------- @@ -119,6 +134,7 @@ using Kokkos::is_array_layout ; KOKKOS_IMPL_IS_CONCEPT( iteration_pattern ) KOKKOS_IMPL_IS_CONCEPT( schedule_type ) KOKKOS_IMPL_IS_CONCEPT( index_type ) +KOKKOS_IMPL_IS_CONCEPT( launch_bounds ) } diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index 19de791c0ff118a434235cc1c44923ad103c3f70..ddb11d28944d2145dcf46c8bd1511d03732e0d3f 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -96,11 +96,13 @@ struct InitArguments { int num_numa; int device_id; - InitArguments() { - num_threads = -1; - num_numa = -1; - device_id = -1; - } + InitArguments( int nt = -1 + , int nn = -1 + , int dv = -1) + : num_threads( nt ) + , num_numa( nn ) + , device_id( dv ) + {} }; void initialize(int& narg, char* arg[]); @@ -168,6 
+170,9 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size ) } // namespace Kokkos +#include <Kokkos_Crs.hpp> +#include <Kokkos_WorkGraphPolicy.hpp> + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index 09081d238710ac0673e544433e13cc58902e06b4..8c080f7a8fbd4cc35c39fd0ed601e4d456346001 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -51,6 +51,9 @@ #include <Kokkos_Macros.hpp> #include <impl/Kokkos_Utilities.hpp> +#include <Kokkos_UniqueToken.hpp> +#include <Kokkos_MasterLock.hpp> + //---------------------------------------------------------------------------- // Have assumed a 64bit build (8byte pointers) throughout the code base. diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp new file mode 100644 index 0000000000000000000000000000000000000000..93b3fa5ca9e479f1a5666b4776257e536e9e73d8 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -0,0 +1,333 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CRS_HPP +#define KOKKOS_CRS_HPP + +namespace Kokkos { +namespace Experimental { + +/// \class Crs +/// \brief Compressed row storage array. +/// +/// \tparam DataType The type of stored entries. If a Crs is +/// used as the graph of a sparse matrix, then this is usually an +/// integer type, the type of the column indices in the sparse +/// matrix. +/// +/// \tparam Arg1Type The second template parameter, corresponding +/// either to the Device type (if there are no more template +/// parameters) or to the Layout type (if there is at least one more +/// template parameter). 
+/// +/// \tparam Arg2Type The third template parameter, which if provided +/// corresponds to the Device type. +/// +/// \tparam SizeType The type of row offsets. Usually the default +/// parameter suffices. However, setting a nondefault value is +/// necessary in some cases, for example, if you want to have a +/// sparse matrix with dimensions (and therefore column indices) +/// that fit in \c int, but want to store more than <tt>INT_MAX</tt> +/// entries in the sparse matrix. +/// +/// A row has a range of entries: +/// <ul> +/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li> +/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li> +/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li> +/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li> +/// </ul> +template< class DataType, + class Arg1Type, + class Arg2Type = void, + typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type> +class Crs { +protected: + typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits; + +public: + typedef DataType data_type; + typedef typename traits::array_layout array_layout; + typedef typename traits::execution_space execution_space; + typedef typename traits::memory_space memory_space; + typedef typename traits::device_type device_type; + typedef SizeType size_type; + + typedef Crs< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type; + typedef Crs< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror; + typedef View<size_type* , array_layout, device_type> row_map_type; + typedef View<DataType* , array_layout, device_type> entries_type; + + entries_type entries; + row_map_type row_map; + + //! Construct an empty view. + Crs () : entries(), row_map() {} + + //! Copy constructor (shallow copy).
+ Crs (const Crs& rhs) : entries (rhs.entries), row_map (rhs.row_map) + {} + + template<class EntriesType, class RowMapType> + Crs (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_) + {} + + /** \brief Assign to a view of the rhs array. + * If the old view is the last view + * then allocated memory is deallocated. + */ + Crs& operator= (const Crs& rhs) { + entries = rhs.entries; + row_map = rhs.row_map; + return *this; + } + + /** \brief Destroy this view of the array. + * If the last view then allocated memory is deallocated. + */ + ~Crs() {} + + /** \brief Return number of rows in the graph + */ + KOKKOS_INLINE_FUNCTION + size_type numRows() const { + return (row_map.dimension_0 () != 0) ? + row_map.dimension_0 () - static_cast<size_type> (1) : + static_cast<size_type> (0); + } +}; + +/*--------------------------------------------------------------------------*/ + +template< class OutCounts, + class DataType, + class Arg1Type, + class Arg2Type, + class SizeType> +void get_crs_transpose_counts( + OutCounts& out, + Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in, + std::string const& name = "transpose_counts"); + +template< class OutCounts, + class InCrs> +void get_crs_row_map_from_counts( + OutCounts& out, + InCrs const& in, + std::string const& name = "row_map"); + +template< class DataType, + class Arg1Type, + class Arg2Type, + class SizeType> +void transpose_crs( + Crs<DataType, Arg1Type, Arg2Type, SizeType>& out, + Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in); + +}} // namespace Kokkos::Experimental + +/*--------------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +namespace Experimental { + +template <class InCrs, class OutCounts> +class GetCrsTransposeCounts { + public: + using execution_space = typename InCrs::execution_space; + using self_type = 
GetCrsTransposeCounts<InCrs, OutCounts>; + using index_type = typename InCrs::size_type; + private: + InCrs in; + OutCounts out; + public: + KOKKOS_INLINE_FUNCTION + void operator()(index_type i) const { + atomic_increment( &out[in.entries(i)] ); + } + GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out): + in(arg_in),out(arg_out) { + using policy_type = RangePolicy<index_type, execution_space>; + using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>; + const closure_type closure(*this, policy_type(0, index_type(in.entries.size()))); + closure.execute(); + execution_space::fence(); + } +}; + +template <class InCounts, class OutRowMap> +class CrsRowMapFromCounts { + public: + using execution_space = typename InCounts::execution_space; + using value_type = typename OutRowMap::value_type; + using index_type = typename InCounts::size_type; + private: + InCounts in; + OutRowMap out; + public: + KOKKOS_INLINE_FUNCTION + void operator()(index_type i, value_type& update, bool final_pass) const { + update += in(i); + if (final_pass) { + out(i + 1) = update; + if (i == 0) { + out(0) = 0; + } + } + } + KOKKOS_INLINE_FUNCTION + void init(value_type& update) const { update = 0; } + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& update, const volatile value_type& input) const { + update += input; + } + using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>; + CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out): + in(arg_in),out(arg_out) { + using policy_type = RangePolicy<index_type, execution_space>; + using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>; + closure_type closure(*this, policy_type(0, in.size())); + closure.execute(); + execution_space::fence(); + } +}; + +template <class InCrs, class OutCrs> +class FillCrsTransposeEntries { + public: + using execution_space = typename InCrs::execution_space; + using memory_space = typename InCrs::memory_space; + using value_type = typename 
OutCrs::entries_type::value_type; + using index_type = typename InCrs::size_type; + private: + using counters_type = View<index_type*, memory_space>; + InCrs in; + OutCrs out; + counters_type counters; + public: + KOKKOS_INLINE_FUNCTION + void operator()(index_type i) const { + auto begin = in.row_map(i); + auto end = in.row_map(i + 1); + for (auto j = begin; j < end; ++j) { + auto ti = in.entries(j); + auto tbegin = out.row_map(ti); + auto tj = atomic_fetch_add( &counters(ti), 1 ); + out.entries( tbegin + tj ) = i; + } + } + using self_type = FillCrsTransposeEntries<InCrs, OutCrs>; + FillCrsTransposeEntries(InCrs const& arg_in, OutCrs const& arg_out): + in(arg_in),out(arg_out), + counters("counters", arg_out.numRows()) { + using policy_type = RangePolicy<index_type, execution_space>; + using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>; + const closure_type closure(*this, policy_type(0, index_type(in.numRows()))); + closure.execute(); + execution_space::fence(); + } +}; + +}}} // namespace Kokkos::Impl::Experimental + +/*--------------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Experimental { + +template< class OutCounts, + class DataType, + class Arg1Type, + class Arg2Type, + class SizeType> +void get_crs_transpose_counts( + OutCounts& out, + Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in, + std::string const& name) { + using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>; + out = OutCounts(name, in.numRows()); + Kokkos::Impl::Experimental:: + GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out); +} + +template< class OutRowMap, + class InCounts> +void get_crs_row_map_from_counts( + OutRowMap& out, + InCounts const& in, + std::string const& name) { + out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1); + Kokkos::Impl::Experimental:: + CrsRowMapFromCounts<InCounts, OutRowMap> 
functor(in, out); +} + +template< class DataType, + class Arg1Type, + class Arg2Type, + class SizeType> +void transpose_crs( + Crs<DataType, Arg1Type, Arg2Type, SizeType>& out, + Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in) +{ + typedef Crs<DataType, Arg1Type, Arg2Type, SizeType> crs_type ; + typedef typename crs_type::memory_space memory_space ; + typedef View<SizeType*, memory_space> counts_type ; + { + counts_type counts; + Kokkos::Experimental::get_crs_transpose_counts(counts, in); + Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts, + "tranpose_row_map"); + } + out.entries = decltype(out.entries)("transpose_entries", in.entries.size()); + Kokkos::Impl::Experimental:: + FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out); +} + +}} // namespace Kokkos::Experimental + +#endif /* #define KOKKOS_CRS_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp index f0f0f874580b9bbe028d5fdb13183ea5211aea78..197831dee5b79f53116f2f7524a43e1e98c85f45 100644 --- a/lib/kokkos/core/src/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp @@ -217,8 +217,8 @@ public: private: - cudaStream_t m_stream ; int m_device ; + cudaStream_t m_stream ; }; } // namespace Kokkos @@ -295,6 +295,7 @@ struct VerifyExecutionCanAccessMemorySpace #include <Cuda/Kokkos_Cuda_Team.hpp> #include <Cuda/Kokkos_Cuda_Parallel.hpp> #include <Cuda/Kokkos_Cuda_Task.hpp> +#include <Cuda/Kokkos_Cuda_UniqueToken.hpp> #include <KokkosExp_MDRangePolicy.hpp> //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp index 307ab193b187f51c2e6fa3886c221c106f19b6fa..fb5985e164e0c61a34e2f32192cc7e5d2ecbde05 100644 --- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp @@ -90,7 +90,7 @@ public: , const size_t arg_alloc_size ) const ; /**\brief Return Name of the MemorySpace */ 
- static constexpr const char* name(); + static constexpr const char* name() { return m_name; } /*--------------------------------*/ /** \brief Error reporting for HostSpace attempt to access CudaSpace */ @@ -186,7 +186,7 @@ public: , const size_t arg_alloc_size ) const ; /**\brief Return Name of the MemorySpace */ - static constexpr const char* name(); + static constexpr const char* name() { return m_name; } /*--------------------------------*/ @@ -234,7 +234,7 @@ public: , const size_t arg_alloc_size ) const ; /**\brief Return Name of the MemorySpace */ - static constexpr const char* name(); + static constexpr const char* name() { return m_name; } private: diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index 375a2d37440f0545c596edbb074a8d5035de8ce8..a8c4d77c62c2dc708f175b92e86ba13d3ff3fa63 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -384,6 +384,7 @@ Impl::PerThreadValue PerThread(const int& arg); * WorkTag (none): Tag which is used as the first argument for the functor operator. * Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static). * IndexType<Type> (IndexType<ExecutionSpace::size_type>: Integer Index type used to iterate over the Index space. + * LaunchBounds<int,int> (LaunchBounds<1024,1>: Launch Bounds for CUDA compilation. */ template< class ... 
Properties> class TeamPolicy: public @@ -561,6 +562,45 @@ KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange( const TeamMemberType&, const iType& count ); +#if defined(KOKKOS_ENABLE_PROFILING) +namespace Impl { + +template<typename FunctorType, typename TagType, + bool HasTag = !std::is_same<TagType, void>::value > +struct ParallelConstructName; + +template<typename FunctorType, typename TagType> +struct ParallelConstructName<FunctorType, TagType, true> { + ParallelConstructName(std::string const& label):label_ref(label) { + if (label.empty()) { + default_name = std::string(typeid(FunctorType).name()) + "/" + + typeid(TagType).name(); + } + } + std::string const& get() { + return (label_ref.empty()) ? default_name : label_ref; + } + std::string const& label_ref; + std::string default_name; +}; + +template<typename FunctorType, typename TagType> +struct ParallelConstructName<FunctorType, TagType, false> { + ParallelConstructName(std::string const& label):label_ref(label) { + if (label.empty()) { + default_name = std::string(typeid(FunctorType).name()); + } + } + std::string const& get() { + return (label_ref.empty()) ? default_name : label_ref; + } + std::string const& label_ref; + std::string default_name; +}; + +} // namespace Impl +#endif /* defined KOKKOS_ENABLE_PROFILING */ + } // namespace Kokkos #endif /* #define KOKKOS_EXECPOLICY_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp index e224cd4e84a03df491ed0440c0ffcd5d5f1b3491..9c9af0dd8b8624b98ebfd2fbcefc8bfa613c387f 100644 --- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp @@ -126,14 +126,6 @@ public: //! 
This memory space preferred device_type typedef Kokkos::Device< execution_space, memory_space > device_type; - /*--------------------------------*/ - /* Functions unique to the HBWSpace */ - static int in_parallel(); - - static void register_in_parallel( int (*)() ); - - /*--------------------------------*/ - /**\brief Default memory space instance */ HBWSpace(); HBWSpace( const HBWSpace & rhs ) = default; diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp index d00cce8f608ee261dda128206cb2a182633b492f..431635047a9ea0d65f01290684a59bcf01906159 100644 --- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -130,14 +130,6 @@ public: //! This memory space preferred device_type typedef Kokkos::Device< execution_space, memory_space > device_type; - /*--------------------------------*/ - /* Functions unique to the HostSpace */ - static int in_parallel(); - - static void register_in_parallel( int (*)() ); - - /*--------------------------------*/ - /**\brief Default memory space instance */ HostSpace(); HostSpace( HostSpace && rhs ) = default; @@ -161,7 +153,7 @@ public: , const size_t arg_alloc_size ) const; /**\brief Return Name of the MemorySpace */ - static constexpr const char* name(); + static constexpr const char* name() { return m_name; } private: AllocationMechanism m_alloc_mech; diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp index f300a6d9f694488980d16895293333ce99ca8f62..87c705153ea35419513978ecb938b9f6f9073043 100644 --- a/lib/kokkos/core/src/Kokkos_Layout.hpp +++ b/lib/kokkos/core/src/Kokkos_Layout.hpp @@ -156,6 +156,8 @@ struct LayoutStride { for ( int r = 0 ; r < ARRAY_LAYOUT_MAX_RANK ; ++r ) { tmp.dimension[r] = 0 ; tmp.stride[r] = 0 ; + } + for ( int r = 0 ; r < rank ; ++r ) { check_input &= ~int( 1 << order[r] ); } if ( 0 == check_input ) { diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp 
b/lib/kokkos/core/src/Kokkos_Macros.hpp index 1439dbd3f85f6b02dfd0273420c572e93bc0b3e5..250ef6630ad6ec294782121e2eab866e89b434d9 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -297,6 +297,10 @@ #endif #endif + #if defined( KOKKOS_ARCH_AVX512MIC ) + #define KOKKOS_ENABLE_RFO_PREFETCH 1 + #endif + #if defined( __MIC__ ) // Compiling for Xeon Phi #endif @@ -344,13 +348,18 @@ //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 //#define KOKKOS_ENABLE_PRAGMA_SIMD 1 + #if defined( KOKKOS_ARCH_AVX512MIC ) + #define KOKKOS_ENABLE_RFO_PREFETCH 1 + #endif + #if !defined( KOKKOS_FORCEINLINE_FUNCTION ) #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) #endif #if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \ ( defined( __amd64 ) || defined( __amd64__ ) || \ - defined( __x86_64 ) || defined( __x86_64__ ) ) + defined( __x86_64 ) || defined( __x86_64__ ) || \ + defined(__PPC64__) ) #define KOKKOS_ENABLE_ASM 1 #endif #endif diff --git a/lib/kokkos/core/src/Kokkos_MasterLock.hpp b/lib/kokkos/core/src/Kokkos_MasterLock.hpp new file mode 100644 index 0000000000000000000000000000000000000000..81564b8eac5c7326c0f1bf5786531102c5d8edc5 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_MasterLock.hpp @@ -0,0 +1,73 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MASTER_LOCK_HPP +#define KOKKOS_MASTER_LOCK_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { namespace Experimental { + +// my be used to coordinate work between master instances +// SHOULD NOT be used within a parallel algorithm +// +// This lock should be used with with a scoped lock guard +// i.e. 
std::unique_lock<Lock>, std::lock_guard +// +// cannot be copied or moved +// has the following functions available +// +// Lock() +// ~Lock() +// +// void lock() +// void unlock() +// bool try_lock() +// +template <typename ExecutionSpace> +class MasterLock; + +}} // namespace Kokkos::Experimental + +#endif //KOKKOS_MASTER_LOCK_HPP + diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp index dbf1ad8057fe87ecab80f3d29eed98952be19520..1da936067d59e269ce43fb6a9e65cfe16220cc2e 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -66,11 +66,6 @@ private: enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 }; enum : uint32_t { max_bit_count = CB::max_bit_count }; - /* Defaults for min block, max block, and superblock sizes */ - enum : uint32_t { MIN_BLOCK_SIZE_LG2 = 6 /* 64 bytes */ }; - enum : uint32_t { MAX_BLOCK_SIZE_LG2 = 12 /* 4k bytes */ }; - enum : uint32_t { SUPERBLOCK_SIZE_LG2 = 16 /* 64k bytes */ }; - enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 }; /* Each superblock has a concurrent bitset state @@ -85,6 +80,14 @@ private: * is concurrently updated. */ + /* Mapping between block_size <-> block_state + * + * block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift + * block_size = m_sb_size_lg2 - ( block_state >> state_shift ) + * + * Thus A_block_size < B_block_size <=> A_block_state > B_block_state + */ + typedef typename DeviceType::memory_space base_memory_space ; enum { accessible = @@ -251,10 +254,10 @@ public: * significant runtime performance improvements. 
*/ MemoryPool( const base_memory_space & memspace - , const size_t min_total_alloc_size - , const uint32_t min_block_alloc_size // = 1 << MIN_BLOCK_SIZE_LG2 - , const uint32_t max_block_alloc_size // = 1 << MAX_BLOCK_SIZE_LG2 - , const uint32_t min_superblock_size // = 1 << SUPERBLOCK_SIZE_LG2 + , const size_t min_total_alloc_size + , size_t min_block_alloc_size = 0 + , size_t max_block_alloc_size = 0 + , size_t min_superblock_size = 0 ) : m_tracker() , m_sb_state_array(0) @@ -267,8 +270,43 @@ public: , m_data_offset(0) , m_unused_padding(0) { - const uint32_t int_align_lg2 = 3 ; /* align as int[8] */ - const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ; + const uint32_t int_align_lg2 = 3 ; /* align as int[8] */ + const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ; + + // Constraints and defaults: + // min_block_alloc_size <= max_block_alloc_size + // max_block_alloc_size <= min_superblock_size + // min_superblock_size <= min_total_alloc_size + + const uint32_t MIN_BLOCK_SIZE = 1u << 6 /* 64 bytes */ ; + const uint32_t MAX_BLOCK_SIZE = 1u << 12 /* 4k bytes */ ; + + if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ; + + if ( 0 == max_block_alloc_size ) { + + max_block_alloc_size = MAX_BLOCK_SIZE ; + + // Upper bound of total allocation size + max_block_alloc_size = std::min( size_t(max_block_alloc_size) + , min_total_alloc_size ); + + // Lower bound of minimum block size + max_block_alloc_size = std::max( max_block_alloc_size + , min_block_alloc_size ); + } + + if ( 0 == min_superblock_size ) { + min_superblock_size = max_block_alloc_size ; + + // Upper bound of total allocation size + min_superblock_size = std::min( size_t(min_superblock_size) + , min_total_alloc_size ); + + // Lower bound of maximum block size + min_superblock_size = std::max( min_superblock_size + , max_block_alloc_size ); + } // Block and superblock size is power of two: @@ -435,6 +473,8 @@ public: void * allocate( size_t alloc_size , int32_t 
attempt_limit = 1 ) const noexcept { + if ( 0 == alloc_size ) return (void*) 0 ; + void * p = 0 ; const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size ); @@ -444,10 +484,9 @@ public: // Allocation will fit within a superblock // that has block sizes ( 1 << block_size_lg2 ) - const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ; - const uint32_t block_state = block_count_lg2 << state_shift ; - const uint32_t block_count = 1u << block_count_lg2 ; - const uint32_t block_count_mask = block_count - 1 ; + const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ; + const uint32_t block_state = block_count_lg2 << state_shift ; + const uint32_t block_count = 1u << block_count_lg2 ; // Superblock hints for this block size: // hint_sb_id_ptr[0] is the dynamically changing hint @@ -465,7 +504,7 @@ public: // the guess for which block within a superblock should // be claimed. If not available then a search occurs. - const uint32_t block_id_hint = block_count_mask & + const uint32_t block_id_hint = (uint32_t)( Kokkos::Impl::clock_tic() #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) // Spread out potentially concurrent access @@ -474,6 +513,9 @@ public: #endif ); + // expected state of superblock for allocation + uint32_t sb_state = block_state ; + int32_t sb_id = -1 ; volatile uint32_t * sb_state_array = 0 ; @@ -484,6 +526,8 @@ public: if ( sb_id < 0 ) { + // No superblock specified, try the hint for this block size + sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr ); sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size ); @@ -493,16 +537,20 @@ public: // 0 <= sb_id // sb_state_array == m_sb_state_array + m_sb_state_size * sb_id - if ( block_state == ( state_header_mask & *sb_state_array ) ) { + if ( sb_state == ( state_header_mask & *sb_state_array ) ) { + + // This superblock state is as expected, for the moment. + // Attempt to claim a bit. 
The attempt updates the state + // so have already made sure the state header is as expected. - // This superblock state is assigned to this block size. - // Try to claim a bit. + const uint32_t count_lg2 = sb_state >> state_shift ; + const uint32_t mask = ( 1u << count_lg2 ) - 1 ; const Kokkos::pair<int,int> result = CB::acquire_bounded_lg2( sb_state_array - , block_count_lg2 - , block_id_hint - , block_state + , count_lg2 + , block_id_hint & mask + , sb_state ); // If result.first < 0 then failed to acquire @@ -512,16 +560,18 @@ public: if ( 0 <= result.first ) { // acquired a bit + const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ; + // Set the allocated block pointer p = ((char*)( m_sb_state_array + m_data_offset )) + ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory - + ( result.first << block_size_lg2 ); // block memory + + ( result.first << size_lg2 ); // block memory break ; // Success } -// printf(" acquire block_count_lg2(%d) block_state(0x%x) sb_id(%d) result(%d,%d)\n" , block_count_lg2 , block_state , sb_id , result.first , result.second ); +// printf(" acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second ); } //------------------------------------------------------------------ @@ -529,12 +579,18 @@ public: // Must find a new superblock. // Start searching at designated index for this block size. - // Look for a partially full superblock of this block size. - // Look for an empty superblock just in case cannot find partfull. 
+ // Look for superblock that, in preferential order, + // 1) part-full superblock of this block size + // 2) empty superblock to claim for this block size + // 3) part-full superblock of the next larger block size + sb_state = block_state ; // Expect to find the desired state sb_id = -1 ; + bool update_hint = false ; int32_t sb_id_empty = -1 ; + int32_t sb_id_large = -1 ; + uint32_t sb_state_large = 0 ; sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ; @@ -544,39 +600,55 @@ public: // Note that the state may change at any moment // as concurrent allocations and deallocations occur. - const uint32_t state = *sb_state_array ; - const uint32_t used = state & state_used_mask ; + const uint32_t full_state = *sb_state_array ; + const uint32_t used = full_state & state_used_mask ; + const uint32_t state = full_state & state_header_mask ; - if ( block_state == ( state & state_header_mask ) ) { + if ( state == block_state ) { // Superblock is assigned to this block size - if ( used < block_count ) { + if ( used < block_count ) { // There is room to allocate one block sb_id = id ; - if ( used + 1 < block_count ) { - - // There is room to allocate more than one block + // Is there room to allocate more than one block? - Kokkos::atomic_compare_exchange - ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) ); - } + update_hint = used + 1 < block_count ; break ; } } - else if ( ( used == 0 ) && ( sb_id_empty == -1 ) ) { + else if ( 0 == used ) { + + // Superblock is empty + + if ( -1 == sb_id_empty ) { - // Superblock is not assigned to this block size - // and is the first empty superblock encountered. - // Save this id to use if a partfull superblock is not found. + // Superblock is not assigned to this block size + // and is the first empty superblock encountered. + // Save this id to use if a partfull superblock is not found. 
- sb_id_empty = id ; + sb_id_empty = id ; + } + } + else if ( ( -1 == sb_id_empty /* have not found an empty */ ) && + ( -1 == sb_id_large /* have not found a larger */ ) && + ( state < block_state /* a larger block */ ) && + // is not full: + ( used < ( 1u << ( state >> state_shift ) ) ) ) { + // First superblock encountered that is + // larger than this block size and + // has room for an allocation. + // Save this id to use of partfull or empty superblock not found + sb_id_large = id ; + sb_state_large = state ; } + // Iterate around the superblock array: + if ( ++id < m_sb_count ) { sb_state_array += m_sb_state_size ; } @@ -586,7 +658,7 @@ public: } } -// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d)\n" , m_sb_count , sb_id , sb_id_empty ); +// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large); if ( sb_id < 0 ) { @@ -609,21 +681,31 @@ public: const uint32_t state_empty = state_header_mask & *sb_state_array ; - if ( state_empty == - Kokkos::atomic_compare_exchange - (sb_state_array,state_empty,block_state) ) { + // If this thread claims the empty block then update the hint + update_hint = + state_empty == + Kokkos::atomic_compare_exchange + (sb_state_array,state_empty,block_state); + } + else if ( 0 <= sb_id_large ) { - // If this thread claimed the block then update the hint + // Found a larger superblock with space available - Kokkos::atomic_compare_exchange - ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) ); - } + sb_id = sb_id_large ; + sb_state = sb_state_large ; + + sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size ); } else { // Did not find a potentially usable superblock --attempt_limit ; } } + + if ( update_hint ) { + Kokkos::atomic_compare_exchange + ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) ); + } } // end allocation attempt loop //-------------------------------------------------------------------- @@ -646,6 +728,8 @@ public: 
KOKKOS_INLINE_FUNCTION void deallocate( void * p , size_t /* alloc_size */ ) const noexcept { + if ( 0 == p ) return ; + // Determine which superblock and block const ptrdiff_t d = ((char*)p) - ((char*)( m_sb_state_array + m_data_offset )); diff --git a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp index 94b58b8affe1921f2bfa9faf1e25b3dc303c5220..af9c8ea782ca3395ac608cb2e9c83366e17c8a30 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp +++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp @@ -72,11 +72,11 @@ struct MemoryTraits { //! Tag this class as a kokkos memory traits: typedef MemoryTraits memory_traits ; - enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) }; - enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) }; - enum { Atomic = T & unsigned(Kokkos::Atomic) }; - enum { Restrict = T & unsigned(Kokkos::Restrict) }; - enum { Aligned = T & unsigned(Kokkos::Aligned) }; + enum : bool { Unmanaged = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))) }; + enum : bool { RandomAccess = (unsigned(0) != (T & unsigned(Kokkos::RandomAccess))) }; + enum : bool { Atomic = (unsigned(0) != (T & unsigned(Kokkos::Atomic))) }; + enum : bool { Restrict = (unsigned(0) != (T & unsigned(Kokkos::Restrict))) }; + enum : bool { Aligned = (unsigned(0) != (T & unsigned(Kokkos::Aligned))) }; }; @@ -109,7 +109,11 @@ enum { MEMORY_ALIGNMENT = #else ( 1 << Kokkos::Impl::integral_power_of_two( 128 ) ) #endif - , MEMORY_ALIGNMENT_THRESHOLD = 4 +#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD ) + , MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD +#else + , MEMORY_ALIGNMENT_THRESHOLD = 4 +#endif }; diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp index 3e11621ce6a30700bd1710a1a013b3c2586296c6..d5de01cf2f83e65497c0a692cf96aae26b2aacc6 100644 --- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp +++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp @@ -47,10 +47,6 @@ #include <Kokkos_Macros.hpp> #if defined( 
KOKKOS_ENABLE_OPENMP) -#if !defined(_OPENMP) -#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!" -#endif - #include <Kokkos_Core_fwd.hpp> #include <cstddef> @@ -67,95 +63,144 @@ #include <Kokkos_Layout.hpp> #include <impl/Kokkos_Tags.hpp> +#include <vector> + /*--------------------------------------------------------------------------*/ namespace Kokkos { +namespace Impl { +class OpenMPExec; +} + /// \class OpenMP /// \brief Kokkos device for multicore processors in the host memory space. class OpenMP { public: - //------------------------------------ - //! \name Type declarations that all Kokkos devices must provide. - //@{ - //! Tag this class as a kokkos execution space using execution_space = OpenMP; + + using memory_space = #ifdef KOKKOS_ENABLE_HBWSPACE - using memory_space = Experimental::HBWSpace; + Experimental::HBWSpace; #else - using memory_space = HostSpace; + HostSpace; #endif - //! This execution space preferred device_type - using device_type = Kokkos::Device<execution_space,memory_space>; - - using array_layout = LayoutRight; - using size_type = memory_space::size_type; + //! This execution space preferred device_type + using device_type = Kokkos::Device< execution_space, memory_space >; + using array_layout = LayoutRight; + using size_type = memory_space::size_type; using scratch_memory_space = ScratchMemorySpace< OpenMP >; - //@} - //------------------------------------ - //! \name Functions that all Kokkos execution spaces must implement. - //@{ - - inline static bool in_parallel(); - - /** \brief Set the device in a "sleep" state. A noop for OpenMP. */ - static bool sleep(); - - /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. 
*/ - static bool wake(); + /// \brief Get a handle to the default execution space instance + inline + OpenMP() noexcept; + + // Using omp_get_max_threads(); is problematic + // On Intel (essentially an initial call to the OpenMP runtime + // without a parallel region before will set a process mask for a single core + // The runtime will than bind threads for a parallel region to other cores on the + // entering the first parallel region and make the process mask the aggregate of + // the thread masks. The intend seems to be to make serial code run fast, if you + // compile with OpenMP enabled but don't actually use parallel regions or so + // static int omp_max_threads = omp_get_max_threads(); + static int get_current_max_threads() noexcept; + + /// \brief Initialize the default execution space + /// + /// if ( thread_count == -1 ) + /// then use the number of threads that openmp defaults to + /// if ( thread_count == 0 && Kokkos::hwlow_available() ) + /// then use hwloc to choose the number of threads and change + /// the default number of threads + /// if ( thread_count > 0 ) + /// then force openmp to use the given number of threads and change + /// the default number of threads + static void initialize( int thread_count = -1 ); + + /// \brief Free any resources being consumed by the default execution space + static void finalize(); - /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ - static void fence() {} + /// \brief is the default execution space initialized for current 'master' thread + static bool is_initialized() noexcept; /// \brief Print configuration information to the given output stream. 
- static void print_configuration( std::ostream & , const bool detail = false ); + static void print_configuration( std::ostream & , const bool verbose = false ); + + /// \brief is the instance running a parallel algorithm + inline + static bool in_parallel( OpenMP const& = OpenMP() ) noexcept; + + /// \brief Wait until all dispatched functors complete on the given instance + /// + /// This is a no-op on OpenMP + inline + static void fence( OpenMP const& = OpenMP() ) noexcept; + + /// \brief Does the given instance return immediately after launching + /// a parallel algorithm + /// + /// This always returns false on OpenMP + inline + static bool is_asynchronous( OpenMP const& = OpenMP() ) noexcept; + + + /// \brief Partition the default instance into new instances without creating + /// new masters + /// + /// This is a no-op on OpenMP since the default instance cannot be partitioned + /// without promoting other threads to 'master' + static std::vector<OpenMP> partition(...); + + /// Non-default instances should be ref-counted so that when the last + /// is destroyed the instance resources are released + /// + /// This is a no-op on OpenMP since a non default instance cannot be created + static OpenMP create_instance(...); + + /// \brief Partition the default instance and call 'f' on each new 'master' thread + /// + /// Func is a functor with the following signiture + /// void( int partition_id, int num_partitions ) + template <typename F> + static void partition_master( F const& f + , int requested_num_partitions = 0 + , int requested_partition_size = 0 + ); + + inline + static int thread_pool_size() noexcept; - /// \brief Free any resources being consumed by the device. - static void finalize(); + /** \brief The rank of the executing thread in this thread pool */ + KOKKOS_INLINE_FUNCTION + static int thread_pool_rank() noexcept; - /** \brief Initialize the device. 
- * - * 1) If the hardware locality library is enabled and OpenMP has not - * already bound threads then bind OpenMP threads to maximize - * core utilization and group for memory hierarchy locality. - * - * 2) Allocate a HostThread for each OpenMP thread to hold its - * topology and fan in/out data. - */ - static void initialize( unsigned thread_count = 0 , - unsigned use_numa_count = 0 , - unsigned use_cores_per_numa = 0 ); - - static int is_initialized(); - - /** \brief Return the maximum amount of concurrency. */ - static int concurrency(); +#if !defined( KOKKOS_DISABLE_DEPRECATED ) + /// \brief Initialize the default execution space + static void initialize( int thread_count, + int use_numa_count, + int use_cores_per_numa = 0); - //@} - //------------------------------------ - /** \brief This execution space has a topological thread pool which can be queried. - * - * All threads within a pool have a common memory space for which they are cache coherent. - * depth = 0 gives the number of threads in the whole pool. - * depth = 1 gives the number of threads in a NUMA region, typically sharing L3 cache. - * depth = 2 gives the number of threads at the finest granularity, typically sharing L1 cache. 
- */ - inline static int thread_pool_size( int depth = 0 ); + inline + static int thread_pool_size( int depth ); - /** \brief The rank of the executing thread in this thread pool */ - KOKKOS_INLINE_FUNCTION static int thread_pool_rank(); + static void sleep() {}; + static void wake() {}; - //------------------------------------ + // use UniqueToken + static int concurrency(); - inline static unsigned max_hardware_threads() { return thread_pool_size(0); } + // use UniqueToken + inline + static int max_hardware_threads() noexcept; - KOKKOS_INLINE_FUNCTION static - unsigned hardware_thread_id() { return thread_pool_rank(); } + // use UniqueToken + KOKKOS_INLINE_FUNCTION + static int hardware_thread_id() noexcept; +#endif - static const char* name(); + static constexpr const char* name() noexcept { return "OpenMP"; } }; } // namespace Kokkos @@ -195,6 +240,7 @@ struct VerifyExecutionCanAccessMemorySpace /*--------------------------------------------------------------------------*/ #include <OpenMP/Kokkos_OpenMP_Exec.hpp> +#include <OpenMP/Kokkos_OpenMP_Team.hpp> #include <OpenMP/Kokkos_OpenMP_Parallel.hpp> #include <OpenMP/Kokkos_OpenMP_Task.hpp> diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp index e412e608b28ca52f7d7888ea5fc37af721c5b10c..fc8d6bec812199dcdab12928e97ddc4b9e5440ca 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp @@ -177,22 +177,23 @@ void parallel_for( const ExecPolicy & policy ) { #if defined(KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } + uint64_t kpID = 0; + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecPolicy::work_tag> name(str); + Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID); + } #endif - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); + Kokkos::Impl::shared_allocation_tracking_disable(); Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); + Kokkos::Impl::shared_allocation_tracking_enable(); closure.execute(); #if defined(KOKKOS_ENABLE_PROFILING) - if(Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } #endif } @@ -210,14 +211,15 @@ void parallel_for( const size_t work_count #if defined(KOKKOS_ENABLE_PROFILING) uint64_t kpID = 0; - if(Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID); - } + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str); + Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID); + } #endif - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); + Kokkos::Impl::shared_allocation_tracking_disable(); Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); + Kokkos::Impl::shared_allocation_tracking_enable(); closure.execute(); @@ -420,21 +422,22 @@ void parallel_scan( const ExecutionPolicy & policy { #if defined(KOKKOS_ENABLE_PROFILING) uint64_t kpID = 0; - if(Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelScan("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecutionPolicy::work_tag> name(str); + Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID); + } #endif - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); + Kokkos::Impl::shared_allocation_tracking_disable(); Impl::ParallelScan< FunctorType , ExecutionPolicy > closure( functor , policy ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); + Kokkos::Impl::shared_allocation_tracking_enable(); closure.execute(); #if defined(KOKKOS_ENABLE_PROFILING) - if(Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelScan(kpID); - } + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelScan(kpID); + } #endif } @@ -453,21 +456,22 @@ void parallel_scan( const size_t work_count #if defined(KOKKOS_ENABLE_PROFILING) uint64_t kpID = 0; - if(Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelScan("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); - } + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str); + Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID); + } #endif - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); + Kokkos::Impl::shared_allocation_tracking_disable(); Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) ); - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); + Kokkos::Impl::shared_allocation_tracking_enable(); closure.execute(); #if defined(KOKKOS_ENABLE_PROFILING) - if(Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelScan(kpID); - } + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelScan(kpID); + } #endif } diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index 8ea5183e353dca7e2ccfc1616338549d2ab7c206..9df6d4ba097d62191214e27578c656bf21ea70a5 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -872,13 +872,14 @@ namespace Impl { const FunctorType& functor, ReturnType& return_value) { #if defined(KOKKOS_ENABLE_PROFILING) - uint64_t kpID = 0; - if(Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelReduce("" == label ? 
typeid(FunctorType).name() : label, 0, &kpID); - } + uint64_t kpID = 0; + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Impl::ParallelConstructName<FunctorType, typename PolicyType::work_tag> name(label); + Kokkos::Profiling::beginParallelReduce(name.get(), 0, &kpID); + } #endif - Kokkos::Impl::shared_allocation_tracking_claim_and_disable(); + Kokkos::Impl::shared_allocation_tracking_disable(); #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER Impl::ParallelReduce<typename functor_adaptor::functor_type, PolicyType, typename return_value_adapter::reducer_type > closure(functor_adaptor::functor(functor), @@ -890,13 +891,13 @@ namespace Impl { policy, return_value_adapter::return_value(return_value,functor)); #endif - Kokkos::Impl::shared_allocation_tracking_release_and_enable(); + Kokkos::Impl::shared_allocation_tracking_enable(); closure.execute(); #if defined(KOKKOS_ENABLE_PROFILING) - if(Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelReduce(kpID); - } + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelReduce(kpID); + } #endif } diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp index 73e8ae303010b6f76ca2465059fe1c86ff843262..539761a1f9373252a47154ddbd8dda370d08e960 100644 --- a/lib/kokkos/core/src/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Kokkos_Serial.hpp @@ -66,6 +66,7 @@ #include <KokkosExp_MDRangePolicy.hpp> +#include <Kokkos_UniqueToken.hpp> namespace Kokkos { @@ -526,6 +527,7 @@ public: } }; + /*--------------------------------------------------------------------------*/ template< class FunctorType , class ... 
Traits > @@ -604,6 +606,178 @@ public: {} }; +} // namespace Impl +} // namespace Kokkos + + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ +/* Parallel patterns for Kokkos::Serial with MDRangePolicy */ + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType , + Kokkos::Experimental::MDRangePolicy< Traits ... > , + Kokkos::Serial + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; + + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; + + void + exec() const + { + const typename Policy::member_type e = m_policy.end(); + for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) { + iterate_type( m_mdr_policy, m_functor )( i ); + } + } + +public: + + inline + void execute() const + { this->exec(); } + + inline + ParallelFor( const FunctorType & arg_functor + , const MDRangePolicy & arg_policy ) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + {} +}; + + +template< class FunctorType , class ReducerType , class ... Traits > +class ParallelReduce< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , ReducerType + , Kokkos::Serial + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... 
> MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename MDRangePolicy::work_tag WorkTag ; + + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef typename ReducerTypeFwd::value_type ValueType; + + typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ; + + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; + + typedef typename Analysis::pointer_type pointer_type ; + typedef typename Analysis::reference_type reference_type ; + + + using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy + , FunctorType + , WorkTag + , ValueType + >; + + + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; + const ReducerType m_reducer ; + const pointer_type m_result_ptr ; + + inline + void + exec( reference_type update ) const + { + const typename Policy::member_type e = m_policy.end(); + for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) { + iterate_type( m_mdr_policy, m_functor, update )( i ); + } + } + +public: + + inline + void execute() const + { + const size_t pool_reduce_size = + Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) ); + const size_t team_reduce_size = 0 ; // Never shrinks + const size_t team_shared_size = 0 ; // Never shrinks + const size_t thread_local_size = 0 ; // Never shrinks + + serial_resize_thread_team_data( pool_reduce_size + , team_reduce_size + , team_shared_size + , thread_local_size ); + + HostThreadTeamData & data = *serial_get_thread_team_data(); + + pointer_type ptr = + m_result_ptr ? 
m_result_ptr : pointer_type(data.pool_reduce_local()); + + reference_type update = + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr ); + + this-> exec( update ); + + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >:: + final( ReducerConditional::select(m_functor , m_reducer) , ptr ); + } + + template< class HostViewType > + ParallelReduce( const FunctorType & arg_functor , + const MDRangePolicy & arg_policy , + const HostViewType & arg_result_view , + typename std::enable_if< + Kokkos::is_view< HostViewType >::value && + !Kokkos::is_reducer_type<ReducerType>::value + ,void*>::type = NULL) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_result_view.data() ) + { + static_assert( Kokkos::is_view< HostViewType >::value + , "Kokkos::Serial reduce result must be a View" ); + + static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value + , "Kokkos::Serial reduce result must be a View in HostSpace" ); + } + + inline + ParallelReduce( const FunctorType & arg_functor + , MDRangePolicy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( reducer ) + , m_result_ptr( reducer.view().data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } +}; + + + } // namespace Impl } // namespace Kokkos @@ -819,6 +993,60 @@ public: /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ +namespace Kokkos { namespace Experimental { + +template<> +class UniqueToken< Serial, UniqueTokenScope::Instance> +{ +public: + using 
execution_space = Serial; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + inline + int size() const noexcept { return 1; } + + /// \brief acquire value such that 0 <= value < size() + inline + int acquire() const noexcept { return 0; } + + /// \brief release a value acquired by generate + inline + void release( int ) const noexcept {} +}; + +template<> +class UniqueToken< Serial, UniqueTokenScope::Global> +{ +public: + using execution_space = Serial; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + inline + int size() const noexcept { return 1; } + + /// \brief acquire value such that 0 <= value < size() + inline + int acquire() const noexcept { return 0; } + + /// \brief release a value acquired by generate + inline + void release( int ) const noexcept {} +}; + +}} // namespace Kokkos::Experimental + #include <impl/Kokkos_Serial_Task.hpp> #endif // defined( KOKKOS_ENABLE_SERIAL ) diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp index 7edda7aa754ff89d0099ae6c386b4350bc5e57f3..fcfc91a4eeb8bd0e92506a85ec2989d49f81ef54 100644 --- a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp +++ b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp @@ -148,7 +148,7 @@ private: typename std::conditional< Arg2_is_space , Arg2 , void >::type >::type ; - using task_base = Impl::TaskBase< Space , ValueType , void > ; + using task_base = Impl::TaskBase< void , void , void > ; using queue_type = Impl::TaskQueue< Space > ; task_base * 
m_task ; @@ -293,13 +293,17 @@ public: //---------------------------------------- KOKKOS_INLINE_FUNCTION - typename task_base::get_return_type + int is_ready() const noexcept + { return ( 0 == m_task ) || ( ((task_base*) task_base::LockTag) == m_task->m_wait ); } + + KOKKOS_INLINE_FUNCTION + const typename Impl::TaskResult< ValueType >::reference_type get() const { if ( 0 == m_task ) { Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()"); } - return m_task->get(); + return Impl::TaskResult< ValueType >::get( m_task ); } }; @@ -396,7 +400,7 @@ private: using track_type = Kokkos::Impl::SharedAllocationTracker ; using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ; - using task_base = Impl::TaskBase< ExecSpace , void , void > ; + using task_base = Impl::TaskBase< void , void , void > ; track_type m_track ; queue_type * m_queue ; @@ -464,29 +468,19 @@ public: KOKKOS_INLINE_FUNCTION memory_pool * memory() const noexcept - { return m_queue ? m_queue->m_memory : (memory_pool*) 0 ; } + { return m_queue ? &( m_queue->m_memory ) : (memory_pool*) 0 ; } //---------------------------------------- /**\brief Allocation size for a spawned task */ template< typename FunctorType > KOKKOS_FUNCTION size_t spawn_allocation_size() const - { - using task_type = Impl::TaskBase< execution_space - , typename FunctorType::value_type - , FunctorType > ; - - return m_queue->allocate_block_size( sizeof(task_type) ); - } + { return m_queue->template spawn_allocation_size< FunctorType >(); } /**\brief Allocation size for a when_all aggregate */ KOKKOS_FUNCTION size_t when_all_allocation_size( int narg ) const - { - using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ; - - return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) ); - } + { return m_queue->when_all_allocation_size( narg ); } //---------------------------------------- @@ -507,7 +501,7 @@ public: queue_type * const queue = arg_policy.m_scheduler ? 
arg_policy.m_scheduler->m_queue : ( arg_policy.m_dependence.m_task - ? arg_policy.m_dependence.m_task->m_queue + ? static_cast<queue_type*>(arg_policy.m_dependence.m_task->m_queue) : (queue_type*) 0 ); if ( 0 == queue ) { @@ -530,8 +524,12 @@ public: future_type f ; // Allocate task from memory pool + + const size_t alloc_size = + queue->template spawn_allocation_size< FunctorType >(); + f.m_task = - reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type))); + reinterpret_cast< task_type * >(queue->allocate(alloc_size) ); if ( f.m_task ) { @@ -539,15 +537,17 @@ public: // Reference count starts at two: // +1 for the matching decrement when task is complete // +1 for the future - new ( f.m_task ) - task_type( arg_function - , queue - , arg_policy.m_dependence.m_task /* dependence */ - , 2 /* reference count */ - , int(sizeof(task_type)) /* allocation size */ - , int(arg_policy.m_task_type) - , int(arg_policy.m_priority) - , std::move(arg_functor) ); + new ( f.m_task ) task_type( std::move(arg_functor) ); + + f.m_task->m_apply = arg_function ; + f.m_task->m_queue = queue ; + f.m_task->m_next = arg_policy.m_dependence.m_task ; + f.m_task->m_ref_count = 2 ; + f.m_task->m_alloc_size = alloc_size ; + f.m_task->m_task_type = arg_policy.m_task_type ; + f.m_task->m_priority = arg_policy.m_priority ; + + Kokkos::memory_fence(); // The dependence (if any) is processed immediately // within the schedule function, as such the dependence's @@ -586,6 +586,30 @@ public: // Postcondition: task is in Executing-Respawn state } + template< typename FunctorType > + KOKKOS_FUNCTION static + void + respawn( FunctorType * arg_self + , TaskScheduler const & + , TaskPriority const & arg_priority + ) + { + // Precondition: task is in Executing state + + using value_type = typename FunctorType::value_type ; + using task_type = Impl::TaskBase< execution_space + , value_type + , FunctorType > ; + + task_type * const task = static_cast< task_type * >( arg_self ); + + task->m_priority = 
static_cast<int>(arg_priority); + + task->add_dependence( (task_base*) 0 ); + + // Postcondition: task is in Executing-Respawn state + } + //---------------------------------------- /**\brief Return a future that is complete * when all input futures are complete. @@ -596,7 +620,7 @@ public: when_all( Future< A1 , A2 > const arg[] , int narg ) { using future_type = Future< execution_space > ; - using task_base = Kokkos::Impl::TaskBase< execution_space , void , void > ; + using task_base = Kokkos::Impl::TaskBase< void , void , void > ; future_type f ; @@ -610,9 +634,9 @@ public: // Increment reference count to track subsequent assignment. Kokkos::atomic_increment( &(t->m_ref_count) ); if ( queue == 0 ) { - queue = t->m_queue ; + queue = static_cast< queue_type * >( t->m_queue ); } - else if ( queue != t->m_queue ) { + else if ( queue != static_cast< queue_type * >( t->m_queue ) ) { Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" ); } } @@ -620,28 +644,34 @@ public: if ( queue != 0 ) { - size_t const size = sizeof(task_base) + narg * sizeof(task_base*); + size_t const alloc_size = queue->when_all_allocation_size( narg ); f.m_task = - reinterpret_cast< task_base * >( queue->allocate( size ) ); + reinterpret_cast< task_base * >( queue->allocate( alloc_size ) ); if ( f.m_task ) { // Reference count starts at two: // +1 to match decrement when task completes // +1 for the future - new( f.m_task ) task_base( queue - , 2 /* reference count */ - , size /* allocation size */ - , narg /* dependence count */ - ); + + new( f.m_task ) task_base(); + + f.m_task->m_queue = queue ; + f.m_task->m_ref_count = 2 ; + f.m_task->m_alloc_size = alloc_size ; + f.m_task->m_dep_count = narg ; + f.m_task->m_task_type = task_base::Aggregate ; // Assign dependences, reference counts were already incremented - task_base ** const dep = f.m_task->aggregate_dependences(); + task_base * volatile * const dep = + f.m_task->aggregate_dependences(); for ( int i = 0 ; i < narg ; ++i 
) { dep[i] = arg[i].m_task ; } + Kokkos::memory_fence(); + queue->schedule_aggregate( f.m_task ); // this when_all may be processed at any moment } diff --git a/lib/kokkos/core/src/Kokkos_UniqueToken.hpp b/lib/kokkos/core/src/Kokkos_UniqueToken.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1ffb07a6dbd7d277ab6790d4ae625f56bd14e6bc --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_UniqueToken.hpp @@ -0,0 +1,88 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_UNIQUE_TOKEN_HPP +#define KOKKOS_UNIQUE_TOKEN_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { namespace Experimental { + +enum class UniqueTokenScope : int +{ + Instance, + Global +}; + +/// \brief class to generate unique ids based on the required amount of concurrency +/// +/// This object should behave like a ref-counted object, so that when the last +/// instance is destroyed resources are freed if needed +template <typename ExecutionSpace, UniqueTokenScope = UniqueTokenScope::Instance > +class UniqueToken +{ +public: + using execution_space = ExecutionSpace; + using size_type = typename execution_space::size_type; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ); + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type size() const ; + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type acquire() const ; + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release( size_type ) const ; +}; + +}} // namespace Kokkos::Experimental + +#endif //KOKKOS_UNIQUE_TOKEN_HPP diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index 3312aa6a9677aeca5417856b4113b11337eb35cb..1754e4a8fb5999a6baf1ddb8eec22810f4ec5238 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -54,11 +54,14 @@ #include <Kokkos_MemoryTraits.hpp> #include <Kokkos_ExecPolicy.hpp> +#if defined(KOKKOS_ENABLE_PROFILING) +#include <impl/Kokkos_Profiling_Interface.hpp> +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { template< class DataType > @@ -73,16 +76,6 @@ struct ViewDataAnalysis ; template< class , class ... 
> class ViewMapping { public: enum { is_assignable = false }; }; -} /* namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - -namespace Kokkos { -namespace Impl { - -using Kokkos::Experimental::Impl::ViewMapping ; -using Kokkos::Experimental::Impl::ViewDataAnalysis ; - } /* namespace Impl */ } /* namespace Kokkos */ @@ -1563,12 +1556,12 @@ namespace Kokkos { namespace Impl { inline -void shared_allocation_tracking_claim_and_disable() -{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); } +void shared_allocation_tracking_disable() +{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_disable(); } inline -void shared_allocation_tracking_release_and_enable() -{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); } +void shared_allocation_tracking_enable() +{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_enable(); } } /* namespace Impl */ } /* namespace Kokkos */ @@ -1795,6 +1788,20 @@ void deep_copy if ( (void *) dst.data() != (void*) src.data() ) { +#if defined(KOKKOS_ENABLE_PROFILING) + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + Kokkos::Profiling::beginDeepCopy( + Kokkos::Profiling::SpaceHandle(dst_memory_space::name()), + dst.label(), + dst.data(), + Kokkos::Profiling::SpaceHandle(src_memory_space::name()), + src.label(), + src.data(), + nbytes); + } +#endif + // Concern: If overlapping views then a parallel copy will be erroneous. // ... 
@@ -1882,7 +1889,14 @@ void deep_copy else { Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); } - } + +#if defined(KOKKOS_ENABLE_PROFILING) + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endDeepCopy(); + } +#endif + + } // ( (void *) dst.data() != (void*) src.data() ) } } /* namespace Kokkos */ @@ -2249,6 +2263,82 @@ resize( Kokkos::View<T,P...> & v , static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" ); + // Fix #904 by checking dimensions before actually resizing. + // + // Rank is known at compile time, so hopefully the compiler will + // remove branches that are compile-time false. The upcoming "if + // constexpr" language feature would make this certain. + if (view_type::Rank == 1 && + n0 == static_cast<size_t> (v.extent(0))) { + return; + } + if (view_type::Rank == 2 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1))) { + return; + } + if (view_type::Rank == 3 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2))) { + return; + } + if (view_type::Rank == 4 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3))) { + return; + } + if (view_type::Rank == 5 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4))) { + return; + } + if (view_type::Rank == 6 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4)) && + n5 == static_cast<size_t> (v.extent(5))) { + return; + } + 
if (view_type::Rank == 7 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4)) && + n5 == static_cast<size_t> (v.extent(5)) && + n6 == static_cast<size_t> (v.extent(6))) { + return; + } + if (view_type::Rank == 8 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4)) && + n5 == static_cast<size_t> (v.extent(5)) && + n6 == static_cast<size_t> (v.extent(6)) && + n7 == static_cast<size_t> (v.extent(7))) { + return; + } + // If Kokkos ever supports Views of rank > 8, the above code won't + // be incorrect, because avoiding reallocation in resize() is just + // an optimization. + + // TODO (mfh 27 Jun 2017) If the old View has enough space but just + // different dimensions (e.g., if the product of the dimensions, + // including extra space for alignment, will not change), then + // consider just reusing storage. For now, Kokkos always + // reallocates if any of the dimensions change, even if the old View + // has enough space. 
+ view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 ); Kokkos::Impl::ViewRemap< view_type , view_type >( v_resized , v ); @@ -2317,6 +2407,106 @@ void realloc( Kokkos::View<T,P...> & v , } } /* namespace Kokkos */ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { namespace Impl { + +template < class Specialize, typename A, typename B > +struct CommonViewValueType; + +template < typename A, typename B > +struct CommonViewValueType< void, A, B > +{ + using value_type = typename std::common_type< A , B >::type; +}; + + +template < class Specialize, class ValueType > +struct CommonViewAllocProp; + +template < class ValueType > +struct CommonViewAllocProp< void, ValueType > +{ + using value_type = ValueType; + + template < class ... Views > + CommonViewAllocProp( const Views & ... ) {} +}; + + +template < class ... Views > +struct DeduceCommonViewAllocProp; + +// Base case must provide types for: +// 1. specialize 2. value_type 3. is_view 4. prop_type +template < class FirstView > +struct DeduceCommonViewAllocProp< FirstView > +{ + using specialize = typename FirstView::traits::specialize; + + using value_type = typename FirstView::traits::value_type; + + enum : bool { is_view = is_view< FirstView >::value }; + + using prop_type = CommonViewAllocProp< specialize, value_type >; +}; + + +template < class FirstView, class ... NextViews > +struct DeduceCommonViewAllocProp< FirstView, NextViews... > +{ + using NextTraits = DeduceCommonViewAllocProp< NextViews... 
>; + + using first_specialize = typename FirstView::traits::specialize; + using first_value_type = typename FirstView::traits::value_type; + + enum : bool { first_is_view = is_view< FirstView >::value }; + + using next_specialize = typename NextTraits::specialize; + using next_value_type = typename NextTraits::value_type; + + enum : bool { next_is_view = NextTraits::is_view }; + + // common types + + // determine specialize type + // if first and next specialize differ, but are not the same specialize, error out + static_assert( !(!std::is_same< first_specialize, next_specialize >::value && !std::is_same< first_specialize, void>::value && !std::is_same< void, next_specialize >::value) , "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void specialize trait allowed" ); + + // otherwise choose non-void specialize if either/both are non-void + using specialize = typename std::conditional< std::is_same< first_specialize, next_specialize >::value + , first_specialize + , typename std::conditional< ( std::is_same< first_specialize, void >::value + && !std::is_same< next_specialize, void >::value) + , next_specialize + , first_specialize + >::type + >::type; + + using value_type = typename CommonViewValueType< specialize, first_value_type, next_value_type >::value_type; + + enum : bool { is_view = (first_is_view && next_is_view) }; + + using prop_type = CommonViewAllocProp< specialize, value_type >; +}; + +} // end namespace Impl + +template < class ... Views > +using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type ; + +// User function +template < class ... Views > +DeducedCommonPropsType<Views...> +common_view_alloc_prop( Views const & ... views ) +{ + return DeducedCommonPropsType<Views...>( views... 
); +} + +} // namespace Kokkos + + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // For backward compatibility: @@ -2350,6 +2540,9 @@ using Kokkos::Impl::WithoutInitializing_t ; using Kokkos::Impl::AllowPadding_t ; using Kokkos::Impl::SharedAllocationRecord ; using Kokkos::Impl::SharedAllocationTracker ; +using Kokkos::Impl::ViewMapping ; +using Kokkos::Impl::ViewDataAnalysis ; + } /* namespace Impl */ } /* namespace Experimental */ diff --git a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..58b0f72f51b07bb08819719c287a297cf97c85e7 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -0,0 +1,265 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_WORKGRAPHPOLICY_HPP +#define KOKKOS_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { +namespace Experimental { + +template< class functor_type , class execution_space, class ... policy_args > +class WorkGraphExec; + +}}} // namespace Kokkos::Impl::Experimental + +namespace Kokkos { +namespace Experimental { + +template< class ... Properties > +class WorkGraphPolicy +{ +public: + + using self_type = WorkGraphPolicy<Properties ... >; + using traits = Kokkos::Impl::PolicyTraits<Properties ... 
>; + using index_type = typename traits::index_type; + using execution_space = typename traits::execution_space; + using work_tag = typename traits::work_tag; + using memory_space = typename execution_space::memory_space; + using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>; + using member_type = index_type; + +private: + + graph_type m_graph; + + using ints_type = Kokkos::View<std::int32_t*, memory_space>; + using range_type = Kokkos::pair<std::int32_t, std::int32_t>; + using ranges_type = Kokkos::View<range_type*, memory_space>; + const std::int32_t m_total_work; + ints_type m_counts; + ints_type m_queue; + ranges_type m_ranges; + +public: + + struct TagZeroRanges {}; + KOKKOS_INLINE_FUNCTION + void operator()(TagZeroRanges, std::int32_t i) const { + m_ranges[i] = range_type(0, 0); + } + void zero_ranges() { + using policy_type = RangePolicy<std::int32_t, execution_space, TagZeroRanges>; + using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>; + const closure_type closure(*this, policy_type(0, 1)); + closure.execute(); + execution_space::fence(); + } + + struct TagFillQueue {}; + KOKKOS_INLINE_FUNCTION + void operator()(TagFillQueue, std::int32_t i) const { + if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i); + } + void fill_queue() { + using policy_type = RangePolicy<std::int32_t, execution_space, TagFillQueue>; + using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>; + const closure_type closure(*this, policy_type(0, m_total_work)); + closure.execute(); + execution_space::fence(); + } + +private: + + inline + void setup() { + if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) { + Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t"); + } + get_crs_transpose_counts(m_counts, m_graph); + m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work); + deep_copy(m_queue, std::int32_t(-1)); + m_ranges = ranges_type("ranges", 1); + 
fill_queue(); + } + + KOKKOS_INLINE_FUNCTION + std::int32_t pop_work() const { + range_type w(-1,-1); + while (true) { + const range_type w_new( w.first + 1 , w.second ); + w = atomic_compare_exchange( &m_ranges(0) , w , w_new ); + if ( w.first < w.second ) { // there was work in the queue + if ( w_new.first == w.first + 1 && w_new.second == w.second ) { + // we got a work item + std::int32_t i; + // the push_work function may have incremented the end counter + // but not yet written the work index into the queue. + // wait until the entry is valid. + while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) ); + return i; + } // we got a work item + } else { // there was no work in the queue +#ifdef KOKKOS_DEBUG + if ( w_new.first == w.first + 1 && w_new.second == w.second ) { + Kokkos::abort("bug in pop_work"); + } +#endif + if (w.first == m_total_work) { // all work is done + return -1; + } else { // need to wait for more work to be pushed + // take a guess that one work item will be pushed + // the key thing is we can't leave (w) alone, because + // otherwise the next compare_exchange may succeed in + // popping work from an empty queue + w.second++; + } + } // there was no work in the queue + } // while (true) + } + + KOKKOS_INLINE_FUNCTION + void push_work(std::int32_t i) const { + range_type w(-1,-1); + while (true) { + const range_type w_new( w.first , w.second + 1 ); + // try to increment the end counter + w = atomic_compare_exchange( &m_ranges(0) , w , w_new ); + // stop trying if the increment was successful + if ( w.first == w_new.first && w.second + 1 == w_new.second ) break; + } + // write the work index into the claimed spot in the queue + *((volatile std::int32_t*)(&m_queue( w.second ))) = i; + // push this write out into the memory system + memory_fence(); + } + + template< class functor_type , class execution_space, class ... 
policy_args > + friend class Kokkos::Impl::Experimental::WorkGraphExec; + +public: + + WorkGraphPolicy(graph_type arg_graph) + : m_graph(arg_graph) + , m_total_work( arg_graph.numRows() ) + { + setup(); + } + +}; + +}} // namespace Kokkos::Experimental + +/*--------------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +namespace Experimental { + +template< class functor_type , class execution_space, class ... policy_args > +class WorkGraphExec +{ + public: + + using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >; + using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... >; + using member_type = typename policy_type::member_type; + using memory_space = typename execution_space::memory_space; + + protected: + + const functor_type m_functor; + const policy_type m_policy; + + protected: + + KOKKOS_INLINE_FUNCTION + std::int32_t before_work() const { + return m_policy.pop_work(); + } + + KOKKOS_INLINE_FUNCTION + void after_work(std::int32_t i) const { + /* fence any writes that were done by the work item itself + (usually writing its result to global memory) */ + memory_fence(); + const std::int32_t begin = m_policy.m_graph.row_map( i ); + const std::int32_t end = m_policy.m_graph.row_map( i + 1 ); + for (std::int32_t j = begin; j < end; ++j) { + const std::int32_t next = m_policy.m_graph.entries( j ); + const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 ); + if ( old_count == 1 ) m_policy.push_work( next ); + } + } + + inline + WorkGraphExec( const functor_type & arg_functor + , const policy_type & arg_policy ) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + { + } +}; + +}}} // namespace Kokkos::Impl::Experimental + +#ifdef KOKKOS_ENABLE_SERIAL +#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp" +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +#include 
"OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp" +#endif + +#ifdef KOKKOS_ENABLE_CUDA +#include "Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp" +#endif + +#ifdef KOKKOS_ENABLE_THREADS +#include "Threads/Kokkos_Threads_WorkGraphPolicy.hpp" +#endif + +#endif /* #define KOKKOS_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp index 4e0ea93920cee66f3fde2f7f1d0f7351a0a67650..915fbe52c1dea2af5e5b9ccf7b34e73db7736b1b 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp @@ -45,76 +45,101 @@ #if defined( KOKKOS_ENABLE_OPENMP ) #include <cstdio> +#include <cstdlib> + #include <limits> #include <iostream> #include <vector> + #include <Kokkos_Core.hpp> + #include <impl/Kokkos_Error.hpp> -#include <iostream> #include <impl/Kokkos_CPUDiscovery.hpp> #include <impl/Kokkos_Profiling_Interface.hpp> namespace Kokkos { namespace Impl { -namespace { - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel(); - -int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 ); - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel() -{ -#ifndef __CUDA_ARCH__ - return omp_in_parallel() && ! 
kokkos_omp_in_critical_region ; -#else - return 0; -#endif -} - -bool s_using_hwloc = false; - -} // namespace -} // namespace Impl -} // namespace Kokkos - - -namespace Kokkos { -namespace Impl { - -int OpenMPExec::m_map_rank[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 }; -int OpenMPExec::m_pool_topo[ 4 ] = { 0 }; +int g_openmp_hardware_max_threads = 1; -HostThreadTeamData * OpenMPExec::m_pool[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 }; +__thread int t_openmp_hardware_id = 0; +__thread Impl::OpenMPExec * t_openmp_instance = nullptr; -void OpenMPExec::verify_is_process( const char * const label ) +void OpenMPExec::validate_partition( const int nthreads + , int & num_partitions + , int & partition_size + ) { - if ( omp_in_parallel() ) { - std::string msg( label ); - msg.append( " ERROR: in parallel" ); - Kokkos::Impl::throw_runtime_exception( msg ); + if (nthreads == 1) { + num_partitions = 1; + partition_size = 1; + } + else if( num_partitions < 1 && partition_size < 1) { + int idle = nthreads; + for (int np = 2; np <= nthreads ; ++np) { + for (int ps = 1; ps <= nthreads/np; ++ps) { + if (nthreads - np*ps < idle) { + idle = nthreads - np*ps; + num_partitions = np; + partition_size = ps; + } + if (idle == 0) { + break; + } + } + } } + else if( num_partitions < 1 && partition_size > 0 ) { + if ( partition_size <= nthreads ) { + num_partitions = nthreads / partition_size; + } + else { + num_partitions = 1; + partition_size = nthreads; + } + } + else if( num_partitions > 0 && partition_size < 1 ) { + if ( num_partitions <= nthreads ) { + partition_size = nthreads / num_partitions; + } + else { + num_partitions = nthreads; + partition_size = 1; + } + } + else if ( num_partitions * partition_size > nthreads ) { + int idle = nthreads; + const int NP = num_partitions; + const int PS = partition_size; + for (int np = NP; np > 0; --np) { + for (int ps = PS; ps > 0; --ps) { + if ( (np*ps <= nthreads) + && (nthreads - np*ps < idle) ) { + idle = nthreads - np*ps; + num_partitions = np; 
+ partition_size = ps; + } + if (idle == 0) { + break; + } + } + } + } + } -void OpenMPExec::verify_initialized( const char * const label ) +void OpenMPExec::verify_is_master( const char * const label ) { - if ( 0 == m_pool[0] ) { - std::string msg( label ); - msg.append( " ERROR: not initialized" ); - Kokkos::Impl::throw_runtime_exception( msg ); - } - - if ( omp_get_max_threads() != Kokkos::OpenMP::thread_pool_size(0) ) { + if ( !t_openmp_instance ) + { std::string msg( label ); - msg.append( " ERROR: Initialized but threads modified inappropriately" ); + msg.append( " ERROR: in parallel or not initialized" ); Kokkos::Impl::throw_runtime_exception( msg ); } - } + } // namespace Impl } // namespace Kokkos @@ -133,11 +158,11 @@ void OpenMPExec::clear_thread_data() const int old_alloc_bytes = m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ; - Kokkos::HostSpace space ; + OpenMP::memory_space space ; -#pragma omp parallel + #pragma omp parallel num_threads( m_pool_size ) { - const int rank = m_map_rank[ omp_get_thread_num() ]; + const int rank = omp_get_thread_num(); if ( 0 != m_pool[rank] ) { @@ -189,13 +214,13 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes , team_shared_bytes , thread_local_bytes ); - const int pool_size = omp_get_max_threads(); + OpenMP::memory_space space ; - Kokkos::HostSpace space ; + memory_fence(); -#pragma omp parallel + #pragma omp parallel num_threads(m_pool_size) { - const int rank = m_map_rank[ omp_get_thread_num() ]; + const int rank = omp_get_thread_num(); if ( 0 != m_pool[rank] ) { @@ -214,11 +239,14 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes , pool_reduce_bytes , team_reduce_bytes , team_shared_bytes - , thread_local_bytes ); + , thread_local_bytes + ); + + memory_fence(); } /* END #pragma omp parallel */ - HostThreadTeamData::organize_pool( m_pool , pool_size ); + HostThreadTeamData::organize_pool( m_pool , m_pool_size ); } } @@ -232,16 +260,8 @@ namespace Kokkos { 
//---------------------------------------------------------------------------- -int OpenMP::is_initialized() -{ return 0 != Impl::OpenMPExec::m_pool[0]; } - -void OpenMP::initialize( unsigned thread_count , - unsigned use_numa_count , - unsigned use_cores_per_numa ) +int OpenMP::get_current_max_threads() noexcept { - // Before any other call to OMP query the maximum number of threads - // and save the value for re-initialization unit testing. - // Using omp_get_max_threads(); is problematic in conjunction with // Hwloc on Intel (essentially an initial call to the OpenMP runtime // without a parallel region before will set a process mask for a single core @@ -250,110 +270,99 @@ void OpenMP::initialize( unsigned thread_count , // the thread masks. The intend seems to be to make serial code run fast, if you // compile with OpenMP enabled but don't actually use parallel regions or so // static int omp_max_threads = omp_get_max_threads(); - int nthreads = 0; + + int count = 0; #pragma omp parallel { #pragma omp atomic - nthreads++; + ++count; } + return count; +} - static int omp_max_threads = nthreads; - - const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ; - - bool thread_spawn_failed = false ; - - if ( ! is_initialized ) { - - // Use hwloc thread pinning if concerned with locality. - // If spreading threads across multiple NUMA regions. - // If hyperthreading is enabled. - Impl::s_using_hwloc = hwloc::available() && ( - ( 1 < Kokkos::hwloc::get_available_numa_count() ) || - ( 1 < Kokkos::hwloc::get_available_threads_per_core() ) ); - std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPExec::MAX_THREAD_COUNT ]; +void OpenMP::initialize( int thread_count ) +{ + if ( omp_in_parallel() ) { + std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel"); + Kokkos::Impl::throw_runtime_exception(msg); + } - // If hwloc available then use it's maximum value. 
+ if ( Impl::t_openmp_instance ) + { + finalize(); + } - if ( thread_count == 0 ) { - thread_count = Impl::s_using_hwloc - ? Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core() - : omp_max_threads ; + { + if (nullptr == std::getenv("OMP_PROC_BIND") ) { + printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n"); + printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n"); + printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n"); + printf(" For unit testing set OMP_PROC_BIND=false\n"); } - if(Impl::s_using_hwloc) - hwloc::thread_mapping( "Kokkos::OpenMP::initialize" , - false /* do not allow asynchronous */ , - thread_count , - use_numa_count , - use_cores_per_numa , - threads_coord ); + OpenMP::memory_space space ; - // Spawn threads: + // Before any other call to OMP query the maximum number of threads + // and save the value for re-initialization unit testing. - omp_set_num_threads( thread_count ); + Impl::g_openmp_hardware_max_threads = get_current_max_threads(); - // Verify OMP interaction: - if ( int(thread_count) != omp_get_max_threads() ) { - thread_spawn_failed = true ; - } - - // Verify spawning and bind threads: -#pragma omp parallel - { -#pragma omp critical - { - if ( int(thread_count) != omp_get_num_threads() ) { - thread_spawn_failed = true ; - } + int process_num_threads = Impl::g_openmp_hardware_max_threads; - // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region. - // Call to 'new' may not be thread safe as well. - - const unsigned omp_rank = omp_get_thread_num(); - const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() - ? 
Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) - : omp_rank ; + if ( Kokkos::hwloc::available() ) { + process_num_threads = Kokkos::hwloc::get_available_numa_count() + * Kokkos::hwloc::get_available_cores_per_numa() + * Kokkos::hwloc::get_available_threads_per_core(); + } - Impl::OpenMPExec::m_map_rank[ omp_rank ] = thread_r ; + // if thread_count < 0, use g_openmp_hardware_max_threads; + // if thread_count == 0, set g_openmp_hardware_max_threads to process_num_threads + // if thread_count > 0, set g_openmp_hardware_max_threads to thread_count + if (thread_count < 0 ) { + thread_count = Impl::g_openmp_hardware_max_threads; + } + else if( thread_count == 0 && Impl::g_openmp_hardware_max_threads != process_num_threads ) { + Impl::g_openmp_hardware_max_threads = process_num_threads; + omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + } + else { + if( thread_count > process_num_threads ) { + printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n"); + printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count ); } -/* END #pragma omp critical */ + Impl::g_openmp_hardware_max_threads = thread_count; + omp_set_num_threads(Impl::g_openmp_hardware_max_threads); } -/* END #pragma omp parallel */ - if ( ! thread_spawn_failed ) { - Impl::OpenMPExec::m_pool_topo[0] = thread_count ; - Impl::OpenMPExec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count; - Impl::OpenMPExec::m_pool_topo[2] = Impl::s_using_hwloc ? 
thread_count / ( use_numa_count * use_cores_per_numa ) : 1; - - // New, unified host thread team data: - { - size_t pool_reduce_bytes = 32 * thread_count ; - size_t team_reduce_bytes = 32 * thread_count ; - size_t team_shared_bytes = 1024 * thread_count ; - size_t thread_local_bytes = 1024 ; - - Impl::OpenMPExec::resize_thread_data( pool_reduce_bytes - , team_reduce_bytes - , team_shared_bytes - , thread_local_bytes - ); - } + // setup thread local + #pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads) + { + Impl::t_openmp_instance = nullptr; + Impl::t_openmp_hardware_id = omp_get_thread_num(); + Impl::SharedAllocationRecord< void, void >::tracking_enable(); } - } - if ( is_initialized || thread_spawn_failed ) { - std::string msg("Kokkos::OpenMP::initialize ERROR"); + void * const ptr = space.allocate( sizeof(Impl::OpenMPExec) ); - if ( is_initialized ) { msg.append(" : already initialized"); } - if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); } + Impl::t_openmp_instance = new (ptr) Impl::OpenMPExec( Impl::g_openmp_hardware_max_threads ); - Kokkos::Impl::throw_runtime_exception(msg); + // New, unified host thread team data: + { + size_t pool_reduce_bytes = 32 * thread_count ; + size_t team_reduce_bytes = 32 * thread_count ; + size_t team_shared_bytes = 1024 * thread_count ; + size_t thread_local_bytes = 1024 ; + + Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes + , team_reduce_bytes + , team_shared_bytes + , thread_local_bytes + ); + } } + // Check for over-subscription //if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) { // std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." 
<< std::endl; @@ -373,20 +382,38 @@ void OpenMP::initialize( unsigned thread_count , void OpenMP::finalize() { - Impl::OpenMPExec::verify_initialized( "OpenMP::finalize" ); - Impl::OpenMPExec::verify_is_process( "OpenMP::finalize" ); + if ( omp_in_parallel() ) + { + std::string msg("Kokkos::OpenMP::finalize ERROR "); + if( !Impl::t_openmp_instance ) msg.append(": not initialized"); + if( omp_in_parallel() ) msg.append(": in parallel"); + Kokkos::Impl::throw_runtime_exception(msg); + } + + if ( Impl::t_openmp_instance ) { - // New, unified host thread team data: - Impl::OpenMPExec::clear_thread_data(); + const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads + ? Impl::g_openmp_hardware_max_threads + : Impl::t_openmp_instance->m_pool_size; - Impl::OpenMPExec::m_pool_topo[0] = 0 ; - Impl::OpenMPExec::m_pool_topo[1] = 0 ; - Impl::OpenMPExec::m_pool_topo[2] = 0 ; + using Exec = Impl::OpenMPExec; + Exec * instance = Impl::t_openmp_instance; + instance->~Exec(); - omp_set_num_threads(1); + OpenMP::memory_space space; + space.deallocate( instance, sizeof(Exec) ); - if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) { - hwloc::unbind_this_thread(); + #pragma omp parallel num_threads(nthreads) + { + Impl::t_openmp_hardware_id = 0; + Impl::t_openmp_instance = nullptr; + Impl::SharedAllocationRecord< void, void >::tracking_disable(); + } + + // allow main thread to track + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + + Impl::g_openmp_hardware_max_threads = 1; } #if defined(KOKKOS_ENABLE_PROFILING) @@ -396,70 +423,48 @@ void OpenMP::finalize() //---------------------------------------------------------------------------- -void OpenMP::print_configuration( std::ostream & s , const bool detail ) +void OpenMP::print_configuration( std::ostream & s , const bool verbose ) { - Impl::OpenMPExec::verify_is_process( "OpenMP::print_configuration" ); - s << "Kokkos::OpenMP" ; -#if defined( KOKKOS_ENABLE_OPENMP ) - 
s << " KOKKOS_ENABLE_OPENMP" ; -#endif -#if defined( KOKKOS_ENABLE_HWLOC ) - - const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); - - s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]" - << " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" ) - ; -#endif - - const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ; + const bool is_initialized = Impl::t_openmp_instance != nullptr; if ( is_initialized ) { - const int numa_count = Kokkos::Impl::OpenMPExec::m_pool_topo[0] / Kokkos::Impl::OpenMPExec::m_pool_topo[1] ; - const int core_per_numa = Kokkos::Impl::OpenMPExec::m_pool_topo[1] / Kokkos::Impl::OpenMPExec::m_pool_topo[2] ; - const int thread_per_core = Kokkos::Impl::OpenMPExec::m_pool_topo[2] ; + Impl::OpenMPExec::verify_is_master( "OpenMP::print_configuration" ); + + const int numa_count = 1; + const int core_per_numa = Impl::g_openmp_hardware_max_threads; + const int thread_per_core = 1; s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa << " x " << thread_per_core << " ]" << std::endl ; - - if ( detail ) { - std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPExec::m_pool_topo[0] ); - -#pragma omp parallel - { -#pragma omp critical - { - coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate(); - } -/* END #pragma omp critical */ - } -/* END #pragma omp parallel */ - - for ( unsigned i = 0 ; i < coord.size() ; ++i ) { - s << " thread omp_rank[" << i << "]" - << " kokkos_rank[" << Impl::OpenMPExec::m_map_rank[ i ] << "]" - << " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]" - << std::endl ; - } - } } else { s << " not initialized" << std::endl ; } } +std::vector<OpenMP> OpenMP::partition(...) 
+{ return std::vector<OpenMP>(1); } + +OpenMP OpenMP::create_instance(...) { return OpenMP(); } + + +#if !defined( KOKKOS_DISABLE_DEPRECATED ) + int OpenMP::concurrency() { - return thread_pool_size(0); + return Impl::g_openmp_hardware_max_threads; +} + +void OpenMP::initialize( int thread_count , int, int ) +{ + initialize(thread_count); } -const char* OpenMP::name() { return "OpenMP"; } +#endif } // namespace Kokkos diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp index 75b7f5da4a4bf8f2dfa2aeb0b5726a543b17823f..37d2ac831801ac35feca1e250f19486ff842524e 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp @@ -47,6 +47,10 @@ #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ENABLE_OPENMP ) +#if !defined(_OPENMP) +#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!" +#endif + #include <Kokkos_OpenMP.hpp> #include <impl/Kokkos_Traits.hpp> @@ -54,6 +58,8 @@ #include <Kokkos_Atomic.hpp> +#include <Kokkos_UniqueToken.hpp> + #include <iostream> #include <sstream> #include <fstream> @@ -63,8 +69,14 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { +namespace Kokkos { namespace Impl { + +class OpenMPExec; + +extern int g_openmp_hardware_max_threads; + +extern __thread int t_openmp_hardware_id; +extern __thread OpenMPExec * t_openmp_instance; //---------------------------------------------------------------------------- /** \brief Data for OpenMP thread execution */ @@ -74,279 +86,279 @@ public: friend class Kokkos::OpenMP ; - enum { MAX_THREAD_COUNT = 4096 }; - -private: - - static int m_pool_topo[ 4 ]; - static int m_map_rank[ MAX_THREAD_COUNT ]; + enum { MAX_THREAD_COUNT = 512 }; - static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ]; - - static void 
clear_thread_data(); -public: - - // Topology of a cache coherent thread pool: - // TOTAL = NUMA x GRAIN - // pool_size( depth = 0 ) - // pool_size(0) = total number of threads - // pool_size(1) = number of threads per NUMA - // pool_size(2) = number of threads sharing finest grain memory hierarchy + static void validate_partition( const int nthreads + , int & num_partitions + , int & partition_size + ); - inline static - int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; } +private: + OpenMPExec( int arg_pool_size ) + : m_pool_size{ arg_pool_size } + , m_level{ omp_get_level() } + , m_pool() + {} + + ~OpenMPExec() + { + clear_thread_data(); + } - static void finalize(); + int m_pool_size; + int m_level; - static void initialize( const unsigned team_count , - const unsigned threads_per_team , - const unsigned numa_count , - const unsigned cores_per_numa ); + HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ]; - static void verify_is_process( const char * const ); - static void verify_initialized( const char * const ); +public: + static void verify_is_master( const char * const ); - static void resize_thread_data( size_t pool_reduce_bytes , size_t team_reduce_bytes , size_t team_shared_bytes , size_t thread_local_bytes ); - inline static - HostThreadTeamData * get_thread_data() noexcept - { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; } + inline + HostThreadTeamData * get_thread_data() const noexcept + { return m_pool[ m_level == omp_get_level() ? 
0 : omp_get_thread_num() ]; } - inline static - HostThreadTeamData * get_thread_data( int i ) noexcept - { return m_pool[i]; } + inline + HostThreadTeamData * get_thread_data( int i ) const noexcept + { return m_pool[i]; } }; -} // namespace Impl -} // namespace Kokkos +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Impl { - -template< class ... Properties > -class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...> -{ -public: - //! Tag this class as a kokkos execution policy - typedef TeamPolicyInternal execution_policy ; - - typedef PolicyTraits<Properties ... > traits; - - TeamPolicyInternal& operator = (const TeamPolicyInternal& p) { - m_league_size = p.m_league_size; - m_team_size = p.m_team_size; - m_team_alloc = p.m_team_alloc; - m_team_iter = p.m_team_iter; - m_team_scratch_size[0] = p.m_team_scratch_size[0]; - m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; - m_team_scratch_size[1] = p.m_team_scratch_size[1]; - m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; - m_chunk_size = p.m_chunk_size; - return *this; - } +inline OpenMP::OpenMP() noexcept +{} - //---------------------------------------- +inline +bool OpenMP::is_initialized() noexcept +{ return Impl::t_openmp_instance != nullptr; } - template< class FunctorType > - inline static - int team_size_max( const FunctorType & ) { - int pool_size = traits::execution_space::thread_pool_size(1); - int max_host_team_size = Impl::HostThreadTeamData::max_team_members; - return pool_size<max_host_team_size?pool_size:max_host_team_size; - } +inline +bool OpenMP::in_parallel( OpenMP const& ) noexcept +{ + //t_openmp_instance is only non-null on a master thread + return !Impl::t_openmp_instance + || Impl::t_openmp_instance->m_level < omp_get_level() + ; +} - template< class FunctorType > - inline 
static - int team_size_recommended( const FunctorType & ) - { return traits::execution_space::thread_pool_size(2); } +inline +int OpenMP::thread_pool_size() noexcept +{ + return OpenMP::in_parallel() + ? omp_get_num_threads() + : Impl::t_openmp_instance->m_pool_size + ; +} - template< class FunctorType > - inline static - int team_size_recommended( const FunctorType &, const int& ) - { return traits::execution_space::thread_pool_size(2); } +KOKKOS_INLINE_FUNCTION +int OpenMP::thread_pool_rank() noexcept +{ +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Impl::t_openmp_instance ? 0 : omp_get_thread_num(); +#else + return -1 ; +#endif +} - //---------------------------------------- +inline +void OpenMP::fence( OpenMP const& instance ) noexcept {} -private: +inline +bool OpenMP::is_asynchronous( OpenMP const& instance ) noexcept +{ return false; } + +template <typename F> +void OpenMP::partition_master( F const& f + , int num_partitions + , int partition_size + ) +{ + if (omp_get_nested()) { + using Exec = Impl::OpenMPExec; - int m_league_size ; - int m_team_size ; - int m_team_alloc ; - int m_team_iter ; + Exec * prev_instance = Impl::t_openmp_instance; - size_t m_team_scratch_size[2]; - size_t m_thread_scratch_size[2]; + Exec::validate_partition( prev_instance->m_pool_size, num_partitions, partition_size ); - int m_chunk_size; + OpenMP::memory_space space; - inline void init( const int league_size_request - , const int team_size_request ) + #pragma omp parallel num_threads(num_partitions) { - const int pool_size = traits::execution_space::thread_pool_size(0); - const int max_host_team_size = Impl::HostThreadTeamData::max_team_members; - const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size; - const int team_grain = traits::execution_space::thread_pool_size(2); + void * const ptr = space.allocate( sizeof(Exec) ); - m_league_size = league_size_request ; + Impl::t_openmp_instance = new (ptr) Exec( partition_size ); - 
m_team_size = team_size_request < team_max ? - team_size_request : team_max ; + size_t pool_reduce_bytes = 32 * partition_size ; + size_t team_reduce_bytes = 32 * partition_size ; + size_t team_shared_bytes = 1024 * partition_size ; + size_t thread_local_bytes = 1024 ; - // Round team size up to a multiple of 'team_gain' - const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain ); - const int team_count = pool_size / team_size_grain ; + Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes + , team_reduce_bytes + , team_shared_bytes + , thread_local_bytes + ); - // Constraint : pool_size = m_team_alloc * team_count - m_team_alloc = pool_size / team_count ; + f( omp_get_thread_num(), omp_get_num_threads() ); - // Maxumum number of iterations each team will take: - m_team_iter = ( m_league_size + team_count - 1 ) / team_count ; - - set_auto_chunk_size(); + Impl::t_openmp_instance->~Exec(); + space.deallocate( Impl::t_openmp_instance, sizeof(Exec) ); + Impl::t_openmp_instance = nullptr; } -public: + Impl::t_openmp_instance = prev_instance; + } + else { + // nested openmp not enabled + f(0,1); + } +} - inline int team_size() const { return m_team_size ; } - inline int league_size() const { return m_league_size ; } - inline size_t scratch_size(const int& level, int team_size_ = -1) const { - if(team_size_ < 0) team_size_ = m_team_size; - return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ; - } +namespace Experimental { - /** \brief Specify league size, request team size */ - TeamPolicyInternal( typename traits::execution_space & - , int league_size_request - , int team_size_request - , int /* vector_length_request */ = 1 ) - : m_team_scratch_size { 0 , 0 } - , m_thread_scratch_size { 0 , 0 } - , m_chunk_size(0) - { init( league_size_request , team_size_request ); } - - TeamPolicyInternal( typename traits::execution_space & - , int league_size_request - , const Kokkos::AUTO_t & /* team_size_request */ - , 
int /* vector_length_request */ = 1) - : m_team_scratch_size { 0 , 0 } - , m_thread_scratch_size { 0 , 0 } - , m_chunk_size(0) - { init( league_size_request , traits::execution_space::thread_pool_size(2) ); } - - TeamPolicyInternal( int league_size_request - , int team_size_request - , int /* vector_length_request */ = 1 ) - : m_team_scratch_size { 0 , 0 } - , m_thread_scratch_size { 0 , 0 } - , m_chunk_size(0) - { init( league_size_request , team_size_request ); } - - TeamPolicyInternal( int league_size_request - , const Kokkos::AUTO_t & /* team_size_request */ - , int /* vector_length_request */ = 1 ) - : m_team_scratch_size { 0 , 0 } - , m_thread_scratch_size { 0 , 0 } - , m_chunk_size(0) - { init( league_size_request , traits::execution_space::thread_pool_size(2) ); } - - inline int team_alloc() const { return m_team_alloc ; } - inline int team_iter() const { return m_team_iter ; } - - inline int chunk_size() const { return m_chunk_size ; } - - /** \brief set chunk_size to a discrete value*/ - inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const { - TeamPolicyInternal p = *this; - p.m_chunk_size = chunk_size_; - return p; - } +template<> +class MasterLock<OpenMP> +{ +public: + void lock() { omp_set_lock( &m_lock ); } + void unlock() { omp_unset_lock( &m_lock ); } + bool try_lock() { return static_cast<bool>(omp_test_lock( &m_lock )); } + + MasterLock() { omp_init_lock( &m_lock ); } + ~MasterLock() { omp_destroy_lock( &m_lock ); } - inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const { - TeamPolicyInternal p = *this; - p.m_team_scratch_size[level] = per_team.value; - return p; - }; - - inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const { - TeamPolicyInternal p = *this; - p.m_thread_scratch_size[level] = per_thread.value; - return p; - }; - - inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, 
const PerThreadValue& per_thread) const { - TeamPolicyInternal p = *this; - p.m_team_scratch_size[level] = per_team.value; - p.m_thread_scratch_size[level] = per_thread.value; - return p; - }; + MasterLock( MasterLock const& ) = delete; + MasterLock( MasterLock && ) = delete; + MasterLock & operator=( MasterLock const& ) = delete; + MasterLock & operator=( MasterLock && ) = delete; private: - /** \brief finalize chunk_size if it was set to AUTO*/ - inline void set_auto_chunk_size() { + omp_lock_t m_lock; - int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc; - if( concurrency==0 ) concurrency=1; +}; + +template<> +class UniqueToken< OpenMP, UniqueTokenScope::Instance> +{ +public: + using execution_space = OpenMP; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} - if(m_chunk_size > 0) { - if(!Impl::is_integral_power_of_two( m_chunk_size )) - Kokkos::abort("TeamPolicy blocking granularity must be power of two" ); + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::OpenMP::thread_pool_size(); + #else + return 0 ; + #endif } - int new_chunk_size = 1; - while(new_chunk_size*100*concurrency < m_league_size) - new_chunk_size *= 2; - if(new_chunk_size < 128) { - new_chunk_size = 1; - while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) ) - new_chunk_size*=2; + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::OpenMP::thread_pool_rank(); + #else + return 0 ; + #endif } - m_chunk_size = new_chunk_size; - } -public: - typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ; + /// \brief release a value acquired by acquire() + KOKKOS_INLINE_FUNCTION + void release( int ) const noexcept {} }; -} // namespace Impl -} // namespace Kokkos +template<> +class UniqueToken< OpenMP, UniqueTokenScope::Global> +{ +public: + using execution_space = OpenMP; + using size_type = int; -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} -namespace Kokkos { + /// \brief upper bound for acquired values, i.e.
0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::Impl::g_openmp_hardware_max_threads ; + #else + return 0 ; + #endif + } -inline -bool OpenMP::in_parallel() -{ return omp_in_parallel(); } + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::Impl::t_openmp_hardware_id ; + #else + return 0 ; + #endif + } + + /// \brief release a value acquired by acquire() + KOKKOS_INLINE_FUNCTION + void release( int ) const noexcept {} +}; + +} // namespace Experimental + + +#if !defined( KOKKOS_DISABLE_DEPRECATED ) inline int OpenMP::thread_pool_size( int depth ) { - return Impl::OpenMPExec::pool_size(depth); + return depth < 2 + ? thread_pool_size() + : 1; } KOKKOS_INLINE_FUNCTION -int OpenMP::thread_pool_rank() +int OpenMP::hardware_thread_id() noexcept { #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) - return Impl::OpenMPExec::m_map_rank[ omp_get_thread_num() ]; + return Impl::t_openmp_hardware_id; #else return -1 ; #endif } +inline +int OpenMP::max_hardware_threads() noexcept +{ + return Impl::g_openmp_hardware_max_threads; +} + +#endif // KOKKOS_DISABLE_DEPRECATED + } // namespace Kokkos #endif diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index c47e0fc654f78d7b9e3f46c9b2aa14bdc81e1fa3..b54abb00681cbbaecd0d6675b93586b5e62fd32f 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -52,6 +52,8 @@ #include <OpenMP/Kokkos_OpenMP_Exec.hpp> #include <impl/Kokkos_FunctorAdapter.hpp> +#include <KokkosExp_MDRangePolicy.hpp> + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@
-71,8 +73,9 @@ private: typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::member_type Member ; - const FunctorType m_functor ; - const Policy m_policy ; + OpenMPExec * m_instance ; + const FunctorType m_functor ; + const Policy m_policy ; template< class TagType > inline static @@ -110,16 +113,120 @@ private: public: inline void execute() const + { + enum { is_dynamic = std::is_same< typename Policy::schedule_type::type + , Kokkos::Dynamic >::value + }; + + if ( OpenMP::in_parallel() ) { + exec_range< WorkTag >( m_functor + , m_policy.begin() + , m_policy.end() ); + } + else { + + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for"); + + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) + { + HostThreadTeamData & data = *(m_instance->get_thread_data()); + + data.set_work_partition( m_policy.end() - m_policy.begin() + , m_policy.chunk_size() ); + + if ( is_dynamic ) { + // Make sure work partition is set before stealing + if ( data.pool_rendezvous() ) data.pool_rendezvous_release(); + } + + std::pair<int64_t,int64_t> range(0,0); + + do { + + range = is_dynamic ? data.get_work_stealing_chunk() + : data.get_work_partition(); + + ParallelFor::template + exec_range< WorkTag >( m_functor + , range.first + m_policy.begin() + , range.second + m_policy.begin() ); + + } while ( is_dynamic && 0 <= range.first ); + } + } + } + + inline + ParallelFor( const FunctorType & arg_functor + , Policy arg_policy ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_policy( arg_policy ) + {} +}; + + +// MDRangePolicy impl +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::OpenMP + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... 
> MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + typedef typename MDRangePolicy::work_tag WorkTag ; + + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; + + OpenMPExec * m_instance ; + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member ibeg , const Member iend ) { + #ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP + #pragma ivdep + #endif + #endif + for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) { + iterate_type( mdr_policy, functor )( iwork ); + } + } + +public: + + inline void execute() const + { enum { is_dynamic = std::is_same< typename Policy::schedule_type::type , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for"); + if ( OpenMP::in_parallel() ) { + ParallelFor::exec_range ( m_mdr_policy + , m_functor + , m_policy.begin() + , m_policy.end() ); + } + else { + + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for"); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); data.set_work_partition( m_policy.end() - m_policy.begin() , m_policy.chunk_size() ); @@ -136,8 +243,8 @@ public: range = is_dynamic ? 
data.get_work_stealing_chunk() : data.get_work_partition(); - ParallelFor::template - exec_range< WorkTag >( m_functor + ParallelFor::exec_range( m_mdr_policy + , m_functor , range.first + m_policy.begin() , range.second + m_policy.begin() ); @@ -145,12 +252,15 @@ public: } // END #pragma omp parallel } + } inline ParallelFor( const FunctorType & arg_functor - , Policy arg_policy ) - : m_functor( arg_functor ) - , m_policy( arg_policy ) + , MDRangePolicy arg_policy ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) {} }; @@ -191,10 +301,11 @@ private: typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - const FunctorType m_functor ; - const Policy m_policy ; - const ReducerType m_reducer ; - const pointer_type m_result_ptr ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; template< class TagType > inline static @@ -228,21 +339,21 @@ public: enum { is_dynamic = std::is_same< typename Policy::schedule_type::type , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); const size_t pool_reduce_bytes = Analysis::value_size( ReducerConditional::select(m_functor, m_reducer)); - OpenMPExec::resize_thread_data( pool_reduce_bytes + m_instance->resize_thread_data( pool_reduce_bytes , 0 // team_reduce_bytes , 0 // team_shared_bytes , 0 // thread_local_bytes ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); data.set_work_partition( 
m_policy.end() - m_policy.begin() , m_policy.chunk_size() ); @@ -271,16 +382,15 @@ public: } while ( is_dynamic && 0 <= range.first ); } -// END #pragma omp parallel // Reduction: - const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() ); + const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() ); - for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) { + for ( int i = 1 ; i < pool_size ; ++i ) { ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr - , OpenMPExec::get_thread_data(i)->pool_reduce_local() ); + , m_instance->get_thread_data(i)->pool_reduce_local() ); } Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); @@ -303,7 +413,8 @@ public: Kokkos::is_view< ViewType >::value && !Kokkos::is_reducer_type<ReducerType>::value ,void*>::type = NULL) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( InvalidType() ) , m_result_ptr( arg_view.data() ) @@ -317,7 +428,8 @@ public: ParallelReduce( const FunctorType & arg_functor , Policy arg_policy , const ReducerType& reducer ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( reducer ) , m_result_ptr( reducer.view().data() ) @@ -329,6 +441,173 @@ public: }; + +// MDRangePolicy impl +template< class FunctorType , class ReducerType, class ... Traits > +class ParallelReduce< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ...> + , ReducerType + , Kokkos::OpenMP + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... 
> MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename MDRangePolicy::work_tag WorkTag ; + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ; + + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef typename ReducerTypeFwd::value_type ValueType; + + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; + + typedef typename Analysis::pointer_type pointer_type ; + typedef typename Analysis::reference_type reference_type ; + + using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy + , FunctorType + , WorkTag + , ValueType + >; + + OpenMPExec * m_instance ; + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + const ReducerType m_reducer ; + const pointer_type m_result_ptr ; + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member ibeg , const Member iend + , reference_type update ) + { + for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) { + iterate_type( mdr_policy, functor, update )( iwork ); + } + } + +public: + + inline void execute() const + { + enum { is_dynamic = std::is_same< typename Policy::schedule_type::type + , Kokkos::Dynamic >::value }; + + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); + + const size_t pool_reduce_bytes = + Analysis::value_size( ReducerConditional::select(m_functor, m_reducer)); + + m_instance->resize_thread_data( pool_reduce_bytes + , 0 // team_reduce_bytes + , 0 // team_shared_bytes + , 
0 // thread_local_bytes + ); + + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) + { + HostThreadTeamData & data = *(m_instance->get_thread_data()); + + data.set_work_partition( m_policy.end() - m_policy.begin() + , m_policy.chunk_size() ); + + if ( is_dynamic ) { + // Make sure work partition is set before stealing + if ( data.pool_rendezvous() ) data.pool_rendezvous_release(); + } + + reference_type update = + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) + , data.pool_reduce_local() ); + + std::pair<int64_t,int64_t> range(0,0); + + do { + + range = is_dynamic ? data.get_work_stealing_chunk() + : data.get_work_partition(); + + ParallelReduce::exec_range ( m_mdr_policy, m_functor + , range.first + m_policy.begin() + , range.second + m_policy.begin() + , update ); + + } while ( is_dynamic && 0 <= range.first ); + } +// END #pragma omp parallel + + // Reduction: + + const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() ); + + for ( int i = 1 ; i < pool_size ; ++i ) { + ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) + , ptr + , m_instance->get_thread_data(i)->pool_reduce_local() ); + } + + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); + + if ( m_result_ptr ) { + const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) ); + + for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; } + } + } + + //---------------------------------------- + + template< class ViewType > + inline + ParallelReduce( const FunctorType & arg_functor + , MDRangePolicy arg_policy + , const ViewType & arg_view + , typename std::enable_if< + Kokkos::is_view< ViewType >::value && + !Kokkos::is_reducer_type<ReducerType>::value + ,void*>::type = NULL) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( 
Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_view.data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + + inline + ParallelReduce( const FunctorType & arg_functor + , MDRangePolicy arg_policy + , const ReducerType& reducer ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( reducer ) + , m_result_ptr( reducer.view().data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + +}; + } // namespace Impl } // namespace Kokkos @@ -361,8 +640,9 @@ private: typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - const FunctorType m_functor ; - const Policy m_policy ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; template< class TagType > inline static @@ -394,23 +674,23 @@ public: inline void execute() const { - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_scan"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_scan"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan"); const int value_count = Analysis::value_count( m_functor ); const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor ); - OpenMPExec::resize_thread_data( pool_reduce_bytes + m_instance->resize_thread_data( pool_reduce_bytes , 0 // team_reduce_bytes , 0 // team_shared_bytes , 0 // thread_local_bytes ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData 
& data = *(m_instance->get_thread_data()); - const WorkRange range( m_policy, data.pool_rank(), data.pool_size() ); + const WorkRange range( m_policy, omp_get_thread_num(), omp_get_num_threads() ); reference_type update_sum = ValueInit::init( m_functor , data.pool_reduce_local() ); @@ -422,7 +702,7 @@ public: pointer_type ptr_prev = 0 ; - const int n = data.pool_size(); + const int n = omp_get_num_threads(); for ( int i = 0 ; i < n ; ++i ) { @@ -452,7 +732,6 @@ public: ParallelScan::template exec_range< WorkTag > ( m_functor , range.begin() , range.end() , update_base , true ); } -/* END #pragma omp parallel */ } @@ -461,7 +740,8 @@ public: inline ParallelScan( const FunctorType & arg_functor , const Policy & arg_policy ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) {} @@ -492,9 +772,10 @@ private: typedef typename Policy::schedule_type::type SchedTag ; typedef typename Policy::member_type Member ; - const FunctorType m_functor ; - const Policy m_policy ; - const int m_shmem_size ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; + const int m_shmem_size; template< class TagType > inline static @@ -548,22 +829,22 @@ public: { enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for"); const size_t pool_reduce_size = 0 ; // Never shrinks const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size(); const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1); const size_t thread_local_size = 0 ; // Never shrinks - OpenMPExec::resize_thread_data( pool_reduce_size + m_instance->resize_thread_data( pool_reduce_size , team_reduce_size , team_shared_size , thread_local_size ); -#pragma omp parallel + const int pool_size = 
OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); const int active = data.organize_team( m_policy.team_size() ); @@ -598,14 +879,14 @@ public: data.disband_team(); } -// END #pragma omp parallel } inline ParallelFor( const FunctorType & arg_functor , const Policy & arg_policy ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + @@ -646,11 +927,12 @@ private: typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - const FunctorType m_functor ; - const Policy m_policy ; - const ReducerType m_reducer ; - const pointer_type m_result_ptr ; - const int m_shmem_size ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + const int m_shmem_size; template< class TagType > inline static @@ -706,8 +988,7 @@ public: { enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); const size_t pool_reduce_size = Analysis::value_size( ReducerConditional::select(m_functor, m_reducer)); @@ -716,14 +997,15 @@ public: const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1); const size_t thread_local_size = 0 ; // Never shrinks - OpenMPExec::resize_thread_data( pool_reduce_size + m_instance->resize_thread_data( pool_reduce_size , team_reduce_size , team_shared_size , thread_local_size ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - 
HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); const int active = data.organize_team( m_policy.team_size() ); @@ -763,17 +1045,26 @@ public: } data.disband_team(); + + // This thread has updated 'pool_reduce_local()' with its + // contributions to the reduction. The parallel region is + // about to terminate and the master thread will load and + // reduce each 'pool_reduce_local()' contribution. + // Must 'memory_fence()' to guarantee that storing the update to + // 'pool_reduce_local()' will complete before this thread + // exits the parallel region. + + memory_fence(); } -// END #pragma omp parallel // Reduction: - const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() ); + const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() ); - for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) { + for ( int i = 1 ; i < pool_size ; ++i ) { ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr - , OpenMPExec::get_thread_data(i)->pool_reduce_local() ); + , m_instance->get_thread_data(i)->pool_reduce_local() ); } Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); @@ -796,7 +1087,8 @@ public: Kokkos::is_view< ViewType >::value && !Kokkos::is_reducer_type<ReducerType>::value ,void*>::type = NULL) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( InvalidType() ) , m_result_ptr( arg_result.ptr_on_device() ) @@ -810,7 +1102,8 @@ public: ParallelReduce( const FunctorType & arg_functor , Policy arg_policy , const ReducerType& reducer ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( reducer ) , m_result_ptr( reducer.view().data() ) diff --git 
a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp index d4ade211f804982692dec8c63e75a83bea8778cd..77363876b00dbb4f8979b83c366b94dc91d4b93f 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp @@ -105,7 +105,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute { using execution_space = Kokkos::OpenMP ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; static task_root_type * const end = @@ -115,23 +115,19 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute HostThreadTeamData & team_data_single = HostThreadTeamDataSingleton::singleton(); - const int team_size = Impl::OpenMPExec::pool_size(2); // Threads per core - // const int team_size = Impl::OpenMPExec::pool_size(1); // Threads per NUMA + Impl::OpenMPExec * instance = t_openmp_instance; + const int pool_size = OpenMP::thread_pool_size(); -#if 0 -fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size ); -fflush(stdout); -#endif + const int team_size = 1; // Threads per core + instance->resize_thread_data( 0 /* global reduce buffer */ + , 512 * team_size /* team reduce buffer */ + , 0 /* team shared buffer */ + , 0 /* thread local buffer */ + ); - OpenMPExec::resize_thread_data( 0 /* global reduce buffer */ - , 512 * team_size /* team reduce buffer */ - , 0 /* team shared buffer */ - , 0 /* thread local buffer */ - ); - -#pragma omp parallel + #pragma omp parallel num_threads(pool_size) { - Impl::HostThreadTeamData & self = *Impl::OpenMPExec::get_thread_data(); + Impl::HostThreadTeamData & self = *(instance->get_thread_data()); // Organizing threads into a team performs a barrier across the // entire pool to insure proper initialization of the team @@ -142,18 +138,6 @@ fflush(stdout); Member 
single_exec( team_data_single ); Member team_exec( self ); -#if 0 -fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) running\n" - , self.pool_rank() - , self.pool_size() - , team_exec.team_rank() - , team_exec.team_size() - , team_exec.league_rank() - , team_exec.league_size() - ); -fflush(stdout); -#endif - // Loop until all queues are empty and no tasks in flight task_root_type * task = 0 ; @@ -197,15 +181,6 @@ fflush(stdout); // if a single thread task then execute now -#if 0 -fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) executing single task 0x%lx\n" - , self.pool_rank() - , self.pool_size() - , int64_t(task) - ); -fflush(stdout); -#endif - (*task->m_apply)( task , & single_exec ); leader_loop = true ; @@ -220,57 +195,14 @@ fflush(stdout); if ( 0 != task ) { // Thread Team Task -#if 0 -fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n" - , self.pool_rank() - , self.pool_size() - , team_exec.team_rank() - , team_exec.team_size() - , team_exec.league_rank() - , team_exec.league_size() - , int64_t(task) - ); -fflush(stdout); -#endif - (*task->m_apply)( task , & team_exec ); // The m_apply function performs a barrier } } while( 0 != task ); - -#if 0 -fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) ending\n" - , self.pool_rank() - , self.pool_size() - , team_exec.team_rank() - , team_exec.team_size() - , team_exec.league_rank() - , team_exec.league_size() - ); -fflush(stdout); -#endif - } - self.disband_team(); - -#if 0 -fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) disbanded\n" - , self.pool_rank() - , self.pool_size() - ); -fflush(stdout); -#endif - } -// END #pragma omp parallel - -#if 0 -fprintf(stdout,"TaskQueue<OpenMP> execute %d end\n", team_size ); -fflush(stdout); -#endif - } void TaskQueueSpecialization< Kokkos::OpenMP >:: @@ -279,10 +211,10 @@ void TaskQueueSpecialization< Kokkos::OpenMP >:: { using execution_space = 
Kokkos::OpenMP ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; - if ( 1 == omp_get_num_threads() ) { + if ( 1 == OpenMP::thread_pool_size() ) { task_root_type * const end = (task_root_type *) task_root_type::EndTag ; diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index 82fbef255b7fced3ab68ee962587e4dd45b646fb..dfa1635e08cff2a638267f15bf09082e3112be89 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -45,7 +45,7 @@ #define KOKKOS_IMPL_OPENMP_TASK_HPP #include <Kokkos_Macros.hpp> -#if defined( KOKKOS_ENABLE_TASKDAG ) +#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG ) //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -60,7 +60,7 @@ public: using execution_space = Kokkos::OpenMP ; using queue_type = Kokkos::Impl::TaskQueue< execution_space > ; - using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ; + using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ; using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ; // Must specify memory space diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp new file mode 100644 index 0000000000000000000000000000000000000000..743e6b6e62031aebf4ea670c4cd4a971a5e149a7 --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp @@ -0,0 +1,245 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_TEAM_HPP +#define KOKKOS_OPENMP_TEAM_HPP + +#include <Kokkos_Macros.hpp> +#if defined( KOKKOS_ENABLE_OPENMP ) + +#include <OpenMP/Kokkos_OpenMP_Exec.hpp> + +namespace Kokkos { namespace Impl { + +template< class ... Properties > +class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...> +{ +public: + + //! Tag this class as a kokkos execution policy + typedef TeamPolicyInternal execution_policy ; + + typedef PolicyTraits<Properties ... > traits; + + TeamPolicyInternal& operator = (const TeamPolicyInternal& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_team_alloc = p.m_team_alloc; + m_team_iter = p.m_team_iter; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + return *this; + } + + //---------------------------------------- + + template< class FunctorType > + inline static + int team_size_max( const FunctorType & ) { + int pool_size = traits::execution_space::thread_pool_size(1); + int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + return pool_size<max_host_team_size?pool_size:max_host_team_size; + } + + template< class FunctorType > + inline static + int team_size_recommended( const FunctorType & ) + { return traits::execution_space::thread_pool_size(2); } + + template< class FunctorType > + inline static + int team_size_recommended( const FunctorType &, const int& ) + { return traits::execution_space::thread_pool_size(2); } + + //---------------------------------------- + +private: + + int m_league_size ; + int m_team_size ; + int m_team_alloc ; + int m_team_iter ; + + size_t m_team_scratch_size[2]; + size_t 
m_thread_scratch_size[2]; + + int m_chunk_size; + + inline void init( const int league_size_request + , const int team_size_request ) + { + const int pool_size = traits::execution_space::thread_pool_size(0); + const int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size; + const int team_grain = traits::execution_space::thread_pool_size(2); + + m_league_size = league_size_request ; + + m_team_size = team_size_request < team_max ? + team_size_request : team_max ; + + // Round team size up to a multiple of 'team_gain' + const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain ); + const int team_count = pool_size / team_size_grain ; + + // Constraint : pool_size = m_team_alloc * team_count + m_team_alloc = pool_size / team_count ; + + // Maxumum number of iterations each team will take: + m_team_iter = ( m_league_size + team_count - 1 ) / team_count ; + + set_auto_chunk_size(); + } + +public: + + inline int team_size() const { return m_team_size ; } + inline int league_size() const { return m_league_size ; } + + inline size_t scratch_size(const int& level, int team_size_ = -1) const { + if(team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ; + } + + /** \brief Specify league size, request team size */ + TeamPolicyInternal( typename traits::execution_space & + , int league_size_request + , int team_size_request + , int /* vector_length_request */ = 1 ) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } + , m_chunk_size(0) + { init( league_size_request , team_size_request ); } + + TeamPolicyInternal( typename traits::execution_space & + , int league_size_request + , const Kokkos::AUTO_t & /* team_size_request */ + , int /* vector_length_request */ = 1) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } + , m_chunk_size(0) + { init( 
league_size_request , traits::execution_space::thread_pool_size(2) ); } + + TeamPolicyInternal( int league_size_request + , int team_size_request + , int /* vector_length_request */ = 1 ) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } + , m_chunk_size(0) + { init( league_size_request , team_size_request ); } + + TeamPolicyInternal( int league_size_request + , const Kokkos::AUTO_t & /* team_size_request */ + , int /* vector_length_request */ = 1 ) + : m_team_scratch_size { 0 , 0 } + , m_thread_scratch_size { 0 , 0 } + , m_chunk_size(0) + { init( league_size_request , traits::execution_space::thread_pool_size(2) ); } + + inline int team_alloc() const { return m_team_alloc ; } + inline int team_iter() const { return m_team_iter ; } + + inline int chunk_size() const { return m_chunk_size ; } + + /** \brief set chunk_size to a discrete value*/ + inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const { + TeamPolicyInternal p = *this; + p.m_chunk_size = chunk_size_; + return p; + } + + inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const { + TeamPolicyInternal p = *this; + p.m_team_scratch_size[level] = per_team.value; + return p; + }; + + inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const { + TeamPolicyInternal p = *this; + p.m_thread_scratch_size[level] = per_thread.value; + return p; + }; + + inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const { + TeamPolicyInternal p = *this; + p.m_team_scratch_size[level] = per_team.value; + p.m_thread_scratch_size[level] = per_thread.value; + return p; + }; + +private: + /** \brief finalize chunk_size if it was set to AUTO*/ + inline void set_auto_chunk_size() { + + int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc; + if( concurrency==0 ) concurrency=1; + + if(m_chunk_size > 0) { + 
if(!Impl::is_integral_power_of_two( m_chunk_size )) + Kokkos::abort("TeamPolicy blocking granularity must be power of two" ); + } + + int new_chunk_size = 1; + while(new_chunk_size*100*concurrency < m_league_size) + new_chunk_size *= 2; + if(new_chunk_size < 128) { + new_chunk_size = 1; + while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) ) + new_chunk_size*=2; + } + m_chunk_size = new_chunk_size; + } + +public: + typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ; +}; + +}} // namespace Kokkos::Impl + +#endif +#endif /* KOKKOS_OPENMP_TEAM_HPP */ + + diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..289ad15451dd0da1c604c4ace8686a9a7fa71542 --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp @@ -0,0 +1,107 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP +#define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType , + Kokkos::Experimental::WorkGraphPolicy< Traits ... > , + Kokkos::OpenMP + > + : public Kokkos::Impl::Experimental:: + WorkGraphExec< FunctorType, + Kokkos::OpenMP, + Traits ... + > +{ +private: + + typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; + typedef Kokkos::Impl::Experimental:: + WorkGraphExec<FunctorType, Kokkos::OpenMP, Traits ... > Base ; + + template< class TagType > + typename std::enable_if< std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + Base::m_functor( i ); + } + + template< class TagType > + typename std::enable_if< ! 
std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + const TagType t{} ; + Base::m_functor( t , i ); + } + +public: + + inline + void execute() + { + const int pool_size = OpenMP::thread_pool_size(); + + #pragma omp parallel num_threads(pool_size) + { + for (std::int32_t i; (-1 != (i = Base::before_work())); ) { + exec_one< typename Policy::work_tag >( i ); + Base::after_work(i); + } + } + } + + inline + ParallelFor( const FunctorType & arg_functor + , const Policy & arg_policy ) + : Base( arg_functor, arg_policy ) + { + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp index bec7844ed649ba11d919eeb94f6a6f50ef56a2fb..258a9d2ff703907913cff7928e05b38785245b64 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -45,7 +45,7 @@ #define KOKKOS_OPENMPTARGETEXEC_HPP #include <impl/Kokkos_Traits.hpp> -#include <impl/Kokkos_spinwait.hpp> +#include <impl/Kokkos_Spinwait.hpp> #include <Kokkos_Atomic.hpp> #include <iostream> @@ -59,10 +59,10 @@ namespace Impl { class OpenMPTargetExec { -public: +public: enum { MAX_ACTIVE_THREADS = 256*8*56*4 }; enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS/32 }; - + private: static void* scratch_ptr; @@ -70,7 +70,7 @@ public: static void verify_is_process( const char * const ); static void verify_initialized( const char * const ); - static void* get_scratch_ptr(); + static void* get_scratch_ptr(); static void clear_scratch(); static void resize_scratch( int64_t reduce_bytes , int64_t team_reduce_bytes, int64_t team_shared_bytes, int64_t thread_local_bytes ); @@ -159,7 +159,7 @@ public: KOKKOS_INLINE_FUNCTION void team_barrier() const { - #pragma omp barrier + #pragma omp barrier } template<class ValueType> @@ -191,13 +191,13 @@ public: typedef ValueType value_type; const JoinLambdaAdapter<value_type,JoinOp> op(op_in); - + // Make sure there is enough scratch space: typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE , value_type , void >::type type ; const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type); - type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num()); + type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num()); for(int i = m_team_rank; i < n_values; i+= m_team_size) { team_scratch[i] = value_type(); } @@ -209,7 +209,7 @@ public: team_scratch[m_team_rank%n_values]+=value; #pragma omp barrier } - + for(int d = 1; d<n_values;d*=2) { if((m_team_rank+d<n_values) && (m_team_rank%(2*d)==0)) { team_scratch[m_team_rank] += team_scratch[m_team_rank+d]; @@ -374,12 +374,12 @@ private: int m_chunk_size; inline void init( const int 
league_size_request - , const int team_size_request + , const int team_size_request , const int vector_length_request ) { m_league_size = league_size_request ; - m_team_size = team_size_request; + m_team_size = team_size_request; m_vector_length = vector_length_request; diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp index c3b773e073deb8f60f12eb4a89978617768154af..abf390b176e6fd87e9f1fafe254f09a9972c5c1f 100644 --- a/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp +++ b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp @@ -47,7 +47,7 @@ #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ENABLE_QTHREADS ) -#include <impl/Kokkos_spinwait.hpp> +#include <impl/Kokkos_Spinwait.hpp> //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp index 4c805310cc38ca8cabcadb431939150354018427..35b2163ae5fcdf19daf2468f832e30779a3fa995 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp @@ -45,14 +45,14 @@ #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ENABLE_THREADS ) -#include <Kokkos_Core_fwd.hpp> - #include <cstdint> #include <limits> #include <utility> #include <iostream> #include <sstream> + #include <Kokkos_Core.hpp> + #include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_CPUDiscovery.hpp> #include <impl/Kokkos_Profiling_Interface.hpp> @@ -80,9 +80,7 @@ const void * volatile s_current_function_arg = 0 ; struct Sentinel { Sentinel() - { - HostSpace::register_in_parallel( ThreadsExec::in_parallel ); - } + {} ~Sentinel() { @@ -122,6 +120,8 @@ void execute_function_noop( ThreadsExec & , const void * ) {} void ThreadsExec::driver(void) { + SharedAllocationRecord< void, void >::tracking_enable(); + ThreadsExec this_thread ; while ( ThreadsExec::Active == this_thread.m_pool_state ) { @@ -726,6 
+726,8 @@ void ThreadsExec::initialize( unsigned thread_count , // Init the array for used for arbitrarily sized atomics Impl::init_lock_array_host_space(); + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + #if defined(KOKKOS_ENABLE_PROFILING) Kokkos::Profiling::initialize(); #endif diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp index 74de3a2596f26bd399eb3eb9c4be2caa46362160..7557bad7d92f2cfff9c42b56c7feb0770be68598 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp @@ -50,11 +50,12 @@ #include <cstdio> #include <utility> -#include <impl/Kokkos_spinwait.hpp> +#include <impl/Kokkos_Spinwait.hpp> #include <impl/Kokkos_FunctorAdapter.hpp> #include <Kokkos_Atomic.hpp> +#include <Kokkos_UniqueToken.hpp> //---------------------------------------------------------------------------- namespace Kokkos { @@ -275,6 +276,17 @@ public: if ( ! rev_rank ) { Final::final( f , reduce_memory() ); } + + // This thread has updated 'reduce_memory()' and upon returning + // from this function will set 'm_pool_state' to inactive. + // If this is a non-root thread then setting 'm_pool_state' + // to inactive triggers another thread to exit a spinwait + // and read the 'reduce_memory'. + // Must 'memory_fence()' to guarantee that storing the update to + // 'reduce_memory()' will complete before storing the the update to + // 'm_pool_state'. 
+ + memory_fence(); } inline @@ -627,6 +639,62 @@ inline void Threads::fence() } /* namespace Kokkos */ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { namespace Experimental { + +template<> +class UniqueToken< Threads, UniqueTokenScope::Instance> +{ +public: + using execution_space = Threads; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + inline + int size() const noexcept { return Threads::thread_pool_size(); } + + /// \brief acquire value such that 0 <= value < size() + inline + int acquire() const noexcept { return Threads::thread_pool_rank(); } + + /// \brief release a value acquired by generate + inline + void release( int ) const noexcept {} +}; + +template<> +class UniqueToken< Threads, UniqueTokenScope::Global> +{ +public: + using execution_space = Threads; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + inline + int size() const noexcept { return Threads::thread_pool_size(); } + + /// \brief acquire value such that 0 <= value < size() + inline + int acquire() const noexcept { return Threads::thread_pool_rank(); } + + /// \brief release a value acquired by generate + inline + void release( int ) const noexcept {} +}; + +}} // namespace Kokkos::Experimental //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- #endif diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp index c12019413b084cf92a043bd794f5be0ac209c77b..6060bf191fa35fab9642dc084a022e344c8c6928 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp @@ -50,7 +50,7 @@ #include <cstdio> #include <utility> -#include <impl/Kokkos_spinwait.hpp> +#include <impl/Kokkos_Spinwait.hpp> #include <impl/Kokkos_FunctorAdapter.hpp> #include <impl/Kokkos_HostThreadTeam.hpp> @@ -482,6 +482,8 @@ public: void next_static() { if ( m_league_rank < m_league_end ) { + // Make sure all stores are complete before entering the barrier + memory_fence(); team_barrier(); set_team_shared(); } @@ -518,6 +520,8 @@ public: return; if ( m_league_rank < m_league_chunk_end ) { + // Make sure all stores are complete before entering the barrier + memory_fence(); team_barrier(); set_team_shared(); } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp index 0ee0cd3280a549447bc3cfa1b55be607eaafc3df..18ac7d26ad53b23ee75a9efead47cecad5bb6853 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp @@ -55,6 +55,8 @@ #include <impl/Kokkos_StaticAssert.hpp> #include <impl/Kokkos_FunctorAdapter.hpp> +#include <KokkosExp_MDRangePolicy.hpp> + 
//---------------------------------------------------------------------------- namespace Kokkos { @@ -174,6 +176,108 @@ public: {} }; + +// MDRangePolicy impl +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::Threads + > +{ +private: + typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename MDRangePolicy::work_tag WorkTag ; + + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; + + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member ibeg , const Member iend ) + { + #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \ + defined( KOKKOS_ENABLE_PRAGMA_IVDEP ) + #pragma ivdep + #endif + for ( Member i = ibeg ; i < iend ; ++i ) { + iterate_type( mdr_policy, functor )( i ); + } + } + + static void exec( ThreadsExec & exec , const void * arg ) + { + exec_schedule<typename Policy::schedule_type::type>(exec,arg); + } + + template<class Schedule> + static + typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() ); + + ParallelFor::exec_range + ( self.m_mdr_policy, self.m_functor , range.begin() , range.end() ); + + exec.fan_in(); + } + + template<class Schedule> + static + typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value 
>::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() ); + + exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + + while(work_index != -1) { + const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size(); + const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); + + ParallelFor::exec_range + ( self.m_mdr_policy, self.m_functor , begin , end ); + work_index = exec.get_work_index(); + } + + exec.fan_in(); + } + +public: + + inline + void execute() const + { + ThreadsExec::start( & ParallelFor::exec , this ); + ThreadsExec::fence(); + } + + ParallelFor( const FunctorType & arg_functor + , const MDRangePolicy & arg_policy ) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + {} +}; + //---------------------------------------------------------------------------- /* ParallelFor Kokkos::Threads with TeamPolicy */ @@ -440,6 +544,169 @@ public: }; + +// MDRangePolicy impl +template< class FunctorType , class ReducerType, class ... Traits > +class ParallelReduce< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , ReducerType + , Kokkos::Threads + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... 
> MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename MDRangePolicy::work_tag WorkTag ; + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef typename ReducerTypeFwd::value_type ValueType; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy + , FunctorType + , WorkTag + , ValueType + >; + + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + const ReducerType m_reducer ; + const pointer_type m_result_ptr ; + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member & ibeg , const Member & iend + , reference_type update ) + { + #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \ + defined( KOKKOS_ENABLE_PRAGMA_IVDEP ) + #pragma ivdep + #endif + for ( Member i = ibeg ; i < iend ; ++i ) { + iterate_type( mdr_policy, functor, update )( i ); + } + } + + static void + exec( ThreadsExec & exec , const void * arg ) { + exec_schedule<typename Policy::schedule_type::type>(exec, arg); + } + + template<class Schedule> + static + typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + const WorkRange range( self.m_policy, exec.pool_rank(), 
exec.pool_size() ); + + ParallelReduce::exec_range + ( self.m_mdr_policy, self.m_functor , range.begin() , range.end() + , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) ); + + exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + } + + template<class Schedule> + static + typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() ); + + exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ); + while(work_index != -1) { + const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size(); + const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); + ParallelReduce::exec_range + ( self.m_mdr_policy, self.m_functor , begin , end + , update ); + work_index = exec.get_work_index(); + } + + exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + } + +public: + + inline + void execute() const + { + ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 ); + + ThreadsExec::start( & ParallelReduce::exec , this ); + + ThreadsExec::fence(); + + if ( m_result_ptr ) { + + const pointer_type data = + (pointer_type) ThreadsExec::root_reduce_scratch(); + + const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); + for ( unsigned i = 0 ; i < n ; ++i ) { 
m_result_ptr[i] = data[i]; } + } + } + + template< class HostViewType > + ParallelReduce( const FunctorType & arg_functor , + const MDRangePolicy & arg_policy , + const HostViewType & arg_result_view , + typename std::enable_if< + Kokkos::is_view< HostViewType >::value && + !Kokkos::is_reducer_type<ReducerType>::value + ,void*>::type = NULL) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_result_view.ptr_on_device() ) + { + static_assert( Kokkos::is_view< HostViewType >::value + , "Kokkos::Threads reduce result must be a View" ); + + static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value + , "Kokkos::Threads reduce result must be a View in HostSpace" ); + } + + inline + ParallelReduce( const FunctorType & arg_functor + , MDRangePolicy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( reducer ) + , m_result_ptr( reducer.view().data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + +}; + + //---------------------------------------------------------------------------- /* ParallelReduce with Kokkos::Threads and TeamPolicy */ diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..be904a167032f9bf8396b665e45258f79a824143 --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -0,0 +1,115 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_WORKGRAPHPOLICY_HPP +#define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType , + Kokkos::Experimental::WorkGraphPolicy< Traits ... > , + Kokkos::Threads + > + : public Kokkos::Impl::Experimental:: + WorkGraphExec< FunctorType, + Kokkos::Threads, + Traits ... + > +{ +private: + + typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; + typedef Kokkos::Impl::Experimental:: + WorkGraphExec<FunctorType, Kokkos::Threads, Traits ... > Base ; + typedef ParallelFor<FunctorType, + Kokkos::Experimental::WorkGraphPolicy<Traits ...>, + Kokkos::Threads> Self ; + + template< class TagType > + typename std::enable_if< std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + Base::m_functor( i ); + } + + template< class TagType > + typename std::enable_if< ! 
std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + const TagType t{} ; + Base::m_functor( t , i ); + } + + inline void exec_one_thread() const { + for (std::int32_t i; (-1 != (i = Base::before_work())); ) { + exec_one< typename Policy::work_tag >( i ); + Base::after_work(i); + } + } + + static inline void thread_main( ThreadsExec&, const void* arg ) { + const Self& self = *(static_cast<const Self*>(arg)); + self.exec_one_thread(); + } + +public: + + inline + void execute() + { + ThreadsExec::start( & Self::thread_main, this ); + ThreadsExec::fence(); + } + + inline + ParallelFor( const FunctorType & arg_functor + , const Policy & arg_policy ) + : Base( arg_functor, arg_policy ) + { + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index 77a1e8754da77dd2146635c285b6162f8aca3e5f..0171b209e5c1ddd5e4b2e03aaff78759a34335b6 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -141,7 +141,6 @@ namespace Kokkos { namespace Experimental { namespace Impl { #define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7] - // New Loop Macros... // parallel_for, non-tagged #define APPLY( func, ... 
) \ @@ -1010,8 +1009,6 @@ namespace Kokkos { namespace Experimental { namespace Impl { // end tagged macros - - // Structs for calling loops template < int Rank, bool IsLeft, typename IType, typename Tagged, typename Enable = void > struct Tile_Loop_Type; @@ -1279,6 +1276,19 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i template <typename T> using is_void = std::is_same< T , void >; +template <typename T> +struct is_type_array : std::false_type +{ + using value_type = T; +}; + +template <typename T> +struct is_type_array< T[] > : std::true_type +{ + using value_type = T; +}; + + template < typename RP , typename Functor , typename Tag = void @@ -1761,18 +1771,17 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i RP const& m_rp; Functor const& m_func; typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag; -// value_type & m_v; - }; -// ValueType: For reductions +// For ParallelReduce +// ValueType - scalar: For reductions template < typename RP , typename Functor , typename Tag , typename ValueType > -struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value >::type > +struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && !is_type_array<ValueType>::value >::type > { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2251,102 +2260,635 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i }; -// ------------------------------------------------------------------ // - -// MDFunctor - wraps the range_policy and functor to pass to IterateTile -// Serial, Threads, OpenMP -// Cuda uses DeviceIterateTile directly within md_parallel_for -// ParallelReduce -template < typename MDRange, typename Functor, typename ValueType = void > -struct MDFunctor +// For ParallelReduce +// Extra specialization for 
array reductions +// ValueType[]: For array reductions +template < typename RP + , typename Functor + , typename Tag + , typename ValueType + > +struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && is_type_array<ValueType>::value >::type > { - using range_policy = MDRange; - using functor_type = Functor; - using value_type = ValueType; - using work_tag = typename range_policy::work_tag; - using index_type = typename range_policy::index_type; - using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange - , Functor - , work_tag - , value_type - >; + using index_type = typename RP::index_type; + using point_type = typename RP::point_type; + using value_type = typename is_type_array<ValueType>::value_type; // strip away the 'array-ness' [], only underlying type remains inline - MDFunctor( MDRange const& range, Functor const& f, ValueType & v ) - : m_range( range ) - , m_func( f ) + HostIterateTile( RP const& rp, Functor const& func, value_type *v ) // v should be an array; treat as pointer for compatibility since size is not known nor needed here + : m_rp(rp) //Cuda 7.0 does not like braces... + , m_func(func) + , m_v(v) // use with non-void ValueType struct {} inline - MDFunctor( MDFunctor const& ) = default; + bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const { + bool is_full_tile = true; - inline - MDFunctor& operator=( MDFunctor const& ) = default; + for ( int i = 0; i < RP::rank; ++i ) { + if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { + partial_tile[i] = m_rp.m_tile[i] ; + } + else { + is_full_tile = false ; + partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? 
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range + } + } - inline - MDFunctor( MDFunctor && ) = default; + return is_full_tile ; + } // end check bounds - inline - MDFunctor& operator=( MDFunctor && ) = default; -// KOKKOS_FORCEINLINE_FUNCTION //Caused cuda warning - __host__ warning + template <int Rank> + struct RankTag + { + typedef RankTag type; + enum { value = (int)Rank }; + }; + + +#if KOKKOS_ENABLE_NEW_LOOP_MACROS + template <typename IType> inline - void operator()(index_type t, value_type & v) const + void + operator()(IType tile_idx) const { - iterate_type(m_range, m_func, v)(t); - } + point_type m_offset; + point_type m_tiledims; - MDRange m_range; - Functor m_func; -}; + if (RP::outer_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } -// ParallelFor -template < typename MDRange, typename Functor > -struct MDFunctor< MDRange, Functor, void > -{ - using range_policy = MDRange; - using functor_type = Functor; - using work_tag = typename range_policy::work_tag; - using index_type = typename range_policy::index_type; - using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange - , Functor - , work_tag - , void - >; + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims ); - inline - MDFunctor( MDRange const& range, Functor const& f ) - : m_range( range ) - , m_func( f ) - {} + } +#else + template <typename IType> 
inline - MDFunctor( MDFunctor const& ) = default; + void + operator()(IType tile_idx) const + { operator_impl( tile_idx , RankTag<RP::rank>() ); } + // added due to compiler error when using sfinae to choose operator based on rank - inline - MDFunctor& operator=( MDFunctor const& ) = default; + template <typename IType> inline - MDFunctor( MDFunctor && ) = default; + void operator_impl( IType tile_idx , const RankTag<2> ) const + { + point_type m_offset; + point_type m_tiledims; - inline - MDFunctor& operator=( MDFunctor && ) = default; + if (RP::outer_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_2L(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } else { +// #pragma simd + LOOP_2L(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_2R(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } else { +// #pragma simd + LOOP_2R(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } + } // end RP::Right + } //end op() rank == 2 + + + template <typename IType> inline - void operator()(index_type t) const + void operator_impl( IType tile_idx , const RankTag<3> ) const { - iterate_type(m_range, m_func)(t); - } + point_type m_offset; + point_type m_tiledims; - MDRange m_range; - Functor m_func; -}; + if (RP::outer_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * 
m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } -#undef KOKKOS_ENABLE_NEW_LOOP_MACROS + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; -} } } //end namespace Kokkos::Experimental::Impl + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_3L(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } else { +// #pragma simd + LOOP_3L(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_3R(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } else { +// #pragma simd + LOOP_3R(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } + } // end RP::Right -#endif + } //end op() rank == 3 + + + template <typename IType> + inline + void operator_impl( IType tile_idx , const RankTag<4> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_4L(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } else { +// #pragma simd + LOOP_4L(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } + } // end RP::Left + else { + 
if ( full_tile ) { +// #pragma simd + LOOP_4R(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } else { +// #pragma simd + LOOP_4R(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } + } // end RP::Right + + } //end op() rank == 4 + + + template <typename IType> + inline + void operator_impl( IType tile_idx , const RankTag<5> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_5L(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } else { +// #pragma simd + LOOP_5L(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_5R(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } else { +// #pragma simd + LOOP_5R(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } + } // end RP::Right + + } //end op() rank == 5 + + + template <typename IType> + inline + void operator_impl( IType tile_idx , const RankTag<6> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } 
+ + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_6L(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } else { +// #pragma simd + LOOP_6L(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_6R(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } else { +// #pragma simd + LOOP_6R(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } + } // end RP::Right + + } //end op() rank == 6 + + + template <typename IType> + inline + void operator_impl( IType tile_idx , const RankTag<7> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_7L(index_type, m_tiledims) { + apply( LOOP_ARGS_7 ); + } + } else { +// #pragma simd + LOOP_7L(index_type, m_tiledims) { + apply( LOOP_ARGS_7 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_7R(index_type, m_tiledims) { + apply( LOOP_ARGS_7 ); + } + } else { +// #pragma simd + LOOP_7R(index_type, m_tiledims) { + apply( LOOP_ARGS_7 ); + } + } + } // end RP::Right + + } //end op() rank == 7 + + + template <typename IType> + inline + void operator_impl( IType tile_idx , const 
RankTag<8> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i<RP::rank; ++i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + else { + for (int i=RP::rank-1; i>=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_8L(index_type, m_tiledims) { + apply( LOOP_ARGS_8 ); + } + } else { +// #pragma simd + LOOP_8L(index_type, m_tiledims) { + apply( LOOP_ARGS_8 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_8R(index_type, m_tiledims) { + apply( LOOP_ARGS_8 ); + } + } else { +// #pragma simd + LOOP_8R(index_type, m_tiledims) { + apply( LOOP_ARGS_8 ); + } + } + } // end RP::Right + + } //end op() rank == 8 +#endif + + + template <typename... Args> + typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type + apply(Args &&... args) const + { + m_func(args... , m_v); + } + + template <typename... Args> + typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type + apply(Args &&... args) const + { + m_func( m_tag, args... 
, m_v); + } + + + RP const& m_rp; + Functor const& m_func; + value_type * m_v; + typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag; + +}; + + +// ------------------------------------------------------------------ // + +// MDFunctor - wraps the range_policy and functor to pass to IterateTile +// Used for md_parallel_{for,reduce} with Serial, Threads, OpenMP +// Cuda uses DeviceIterateTile directly within md_parallel_for +// TODO Once md_parallel_{for,reduce} removed, this can be removed + +// ParallelReduce - scalar reductions +template < typename MDRange, typename Functor, typename ValueType = void > +struct MDFunctor +{ + using range_policy = MDRange; + using functor_type = Functor; + using value_type = ValueType; + using work_tag = typename range_policy::work_tag; + using index_type = typename range_policy::index_type; + using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange + , Functor + , work_tag + , value_type + >; + + + inline + MDFunctor( MDRange const& range, Functor const& f ) + : m_range( range ) + , m_func( f ) + {} + + inline + MDFunctor( MDFunctor const& ) = default; + + inline + MDFunctor& operator=( MDFunctor const& ) = default; + + inline + MDFunctor( MDFunctor && ) = default; + + inline + MDFunctor& operator=( MDFunctor && ) = default; + + inline + void operator()(index_type t, value_type & v) const + { + iterate_type(m_range, m_func, v)(t); + } + + MDRange m_range; + Functor m_func; +}; + + +// ParallelReduce - array reductions +template < typename MDRange, typename Functor, typename ValueType > +struct MDFunctor< MDRange, Functor, ValueType[] > +{ + using range_policy = MDRange; + using functor_type = Functor; + using value_type = ValueType[]; + using work_tag = typename range_policy::work_tag; + using index_type = typename range_policy::index_type; + using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange + , Functor + , work_tag + , value_type + >; + + + inline 
+ MDFunctor( MDRange const& range, Functor const& f ) + : m_range( range ) + , m_func( f ) + , value_count( f.value_count ) + {} + + inline + MDFunctor( MDFunctor const& ) = default; + + inline + MDFunctor& operator=( MDFunctor const& ) = default; + + inline + MDFunctor( MDFunctor && ) = default; + + inline + MDFunctor& operator=( MDFunctor && ) = default; + + // FIXME Init and Join, as defined in m_func, are not working through the MDFunctor + // Best path forward is to eliminate need for MDFunctor, directly use MDRangePolicy within Parallel{For,Reduce} ?? + inline + void operator()(index_type t, value_type v) const + { + iterate_type(m_range, m_func, v)(t); + } + + MDRange m_range; + Functor m_func; + size_t value_count; +}; + + +// ParallelFor +template < typename MDRange, typename Functor > +struct MDFunctor< MDRange, Functor, void > +{ + using range_policy = MDRange; + using functor_type = Functor; + using work_tag = typename range_policy::work_tag; + using index_type = typename range_policy::index_type; + using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange + , Functor + , work_tag + , void + >; + + + inline + MDFunctor( MDRange const& range, Functor const& f ) + : m_range( range ) + , m_func( f ) + {} + + inline + MDFunctor( MDFunctor const& ) = default; + + inline + MDFunctor& operator=( MDFunctor const& ) = default; + + inline + MDFunctor( MDFunctor && ) = default; + + inline + MDFunctor& operator=( MDFunctor && ) = default; + + inline + void operator()(index_type t) const + { + iterate_type(m_range, m_func)(t); + } + + MDRange m_range; + Functor m_func; +}; + +#undef KOKKOS_ENABLE_NEW_LOOP_MACROS + +} } } //end namespace Kokkos::Experimental::Impl + +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp index c5685c5b624c6da721b3f91877e655c582f36d2d..3fb15c8d1efa639ec2483cb2208d8cf5835f6552 100644 --- a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp +++ 
b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp @@ -55,16 +55,19 @@ template < typename ExecutionSpace = void , typename WorkTag = void , typename IndexType = void , typename IterationPattern = void + , typename LaunchBounds = void > struct PolicyTraitsBase { - using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>; + using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, + IterationPattern, LaunchBounds>; using execution_space = ExecutionSpace; using schedule_type = Schedule; using work_tag = WorkTag; using index_type = IndexType; using iteration_pattern = IterationPattern; + using launch_bounds = LaunchBounds; }; @@ -78,6 +81,7 @@ struct SetExecutionSpace , typename PolicyBase::work_tag , typename PolicyBase::index_type , typename PolicyBase::iteration_pattern + , typename PolicyBase::launch_bounds >; }; @@ -91,6 +95,7 @@ struct SetSchedule , typename PolicyBase::work_tag , typename PolicyBase::index_type , typename PolicyBase::iteration_pattern + , typename PolicyBase::launch_bounds >; }; @@ -104,6 +109,7 @@ struct SetWorkTag , WorkTag , typename PolicyBase::index_type , typename PolicyBase::iteration_pattern + , typename PolicyBase::launch_bounds >; }; @@ -117,6 +123,7 @@ struct SetIndexType , typename PolicyBase::work_tag , IndexType , typename PolicyBase::iteration_pattern + , typename PolicyBase::launch_bounds >; }; @@ -131,6 +138,22 @@ struct SetIterationPattern , typename PolicyBase::work_tag , typename PolicyBase::index_type , IterationPattern + , typename PolicyBase::launch_bounds + >; +}; + + +template <typename PolicyBase, typename LaunchBounds> +struct SetLaunchBounds +{ + static_assert( is_void<typename PolicyBase::launch_bounds>::value + , "Kokkos Error: More than one launch_bounds given" ); + using type = PolicyTraitsBase< typename PolicyBase::execution_space + , typename PolicyBase::schedule_type + , typename PolicyBase::work_tag + , typename PolicyBase::index_type + , typename 
PolicyBase::iteration_pattern + , LaunchBounds >; }; @@ -146,8 +169,9 @@ struct AnalyzePolicy<Base, T, Traits...> : public , typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T> , typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> > , typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T> + , typename std::conditional< is_launch_bounds<T>::value , SetLaunchBounds<Base,T> , SetWorkTag<Base,T> - >::type >::type >::type >::type>::type::type + >::type >::type >::type >::type >::type>::type::type , Traits... > {}; @@ -178,11 +202,18 @@ struct AnalyzePolicy<Base> , void // TODO set default iteration pattern , typename Base::iteration_pattern >::type; + + using launch_bounds = typename std::conditional< is_void< typename Base::launch_bounds >::value + , LaunchBounds<> + , typename Base::launch_bounds + >::type; + using type = PolicyTraitsBase< execution_space , schedule_type , work_tag , index_type , iteration_pattern + , launch_bounds >; }; diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp index 010b15064ee3ebed2cf92ac3e25ab312a3c279bc..5b894b037b83ce8095a78ad094f9c33ef1020d42 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include <xmmintrin.h> +#endif + #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP ) #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP @@ -126,11 +130,21 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare , inline int atomic_compare_exchange( volatile int * const dest, const int compare, const int val) -{ return __sync_val_compare_and_swap(dest,compare,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_val_compare_and_swap(dest,compare,val); +} inline long atomic_compare_exchange( volatile long * const dest, const long compare, const long val ) -{ return __sync_val_compare_and_swap(dest,compare,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_val_compare_and_swap(dest,compare,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) @@ -159,6 +173,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare, KOKKOS_INLINE_FUNCTION U() {}; } tmp ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) ); return tmp.t ; } @@ -175,6 +193,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare, KOKKOS_INLINE_FUNCTION U() {}; } tmp ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) ); return tmp.t ; } @@ -193,6 +215,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare, KOKKOS_INLINE_FUNCTION U() {}; } tmp ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) ); return tmp.t ; } @@ -209,6 +235,10 @@ T atomic_compare_exchange( volatile T * const 
dest , const T compare , #endif , const T >::type& val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + while( !Impl::lock_address_host_space( (void*) dest ) ); T return_val = *dest; if( return_val == compare ) { diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp index 127de528f5303a56da4c0eb25d1012181aa0e598..2a13a4865c7eaec2c24877e9511d79e823b261b7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include <xmmintrin.h> +#endif + #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP ) #define KOKKOS_ATOMIC_DECREMENT_HPP @@ -54,6 +58,10 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement<char>(volatile char* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif + __asm__ __volatile__( "lock decb %0" : /* no output registers */ @@ -69,6 +77,10 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement<short>(volatile short* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif + __asm__ __volatile__( "lock decw %0" : /* no output registers */ @@ -84,6 +96,10 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement<int>(volatile int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! 
defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif + __asm__ __volatile__( "lock decl %0" : /* no output registers */ @@ -99,6 +115,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement<long long int>(volatile long long int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock decq %0" : /* no output registers */ diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp index a1ff47abce66737b6f3875a9e87019d428ab377d..9ba3cae9fca74fd8ad6dc4b69bd17b675897aac7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include <xmmintrin.h> +#endif + #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_EXCHANGE_HPP ) #define KOKKOS_ATOMIC_EXCHANGE_HPP @@ -81,6 +85,10 @@ T atomic_exchange( typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) { // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) ); +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + int tmp = atomicExch( ((int*)dest) , *((int*)&val) ); return *((T*)&tmp); } @@ -93,6 +101,11 @@ T atomic_exchange( sizeof(T) == sizeof(unsigned long long int) , const T & >::type val ) { typedef unsigned long long int type ; + +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + // type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) ); type tmp = atomicExch( ((type*)dest) , *((type*)&val) ); return *((T*)&tmp); @@ -108,6 +121,10 @@ T atomic_exchange( volatile T * const dest , { T return_val; // This is a way to (hopefully) avoid dead lock in a warp +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; @@ -173,6 +190,9 @@ T atomic_exchange( volatile T * const dest , , const T & >::type val ) { typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif const type v = *((type*)&val); // Extract to be sure the value doesn't change @@ -201,6 +221,10 @@ T atomic_exchange( volatile T * const dest , typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t) , const T & >::type val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + union U { Impl::cas128_t i ; T t ; @@ -260,6 +284,10 @@ void atomic_assign( volatile T * const dest , { typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ; +#if 
defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + const type v = *((type*)&val); // Extract to be sure the value doesn't change type assumed ; @@ -285,6 +313,10 @@ void atomic_assign( volatile T * const dest , typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t) , const T & >::type val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + union U { Impl::cas128_t i ; T t ; diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp index 860c8e0e4347d66f9fd2a9714c0effe0322e14f4..084c55efedd20b77dbd1fefa86e8f970a4302473 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include <xmmintrin.h> +#endif + #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_ADD_HPP ) #define KOKKOS_ATOMIC_FETCH_ADD_HPP @@ -161,36 +165,60 @@ T atomic_fetch_add( volatile T * const dest , inline int atomic_fetch_add( volatile int * dest , const int val ) { - int original = val; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif - __asm__ __volatile__( - "lock xadd %1, %0" - : "+m" (*dest), "+r" (original) - : "m" (*dest), "r" (original) - : "memory" + int original = val; + + __asm__ __volatile__( + "lock xadd %1, %0" + : "+m" (*dest), "+r" (original) + : "m" (*dest), "r" (original) + : "memory" ); - return original; + return original; } #else inline int atomic_fetch_add( volatile int * const dest , const int val ) -{ return __sync_fetch_and_add(dest, val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest, val); +} #endif inline long int atomic_fetch_add( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_add(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_add(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest,val); +} inline unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_add(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest,val); +} #endif @@ -205,6 +233,10 @@ T atomic_fetch_add( volatile T * const dest , inline U() {}; } assume , oldval , newval ; +#if defined( 
KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { @@ -228,6 +260,10 @@ T atomic_fetch_add( volatile T * const dest , inline U() {}; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { @@ -253,6 +289,10 @@ T atomic_fetch_add( volatile T * const dest , inline U() {}; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp index 83f5b2a5aadb001115748209d9c098429fb1afff..6ecb65336c737214d364e8981bed85e4b6993191 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include <xmmintrin.h> +#endif + #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_AND_HPP ) #define KOKKOS_ATOMIC_FETCH_AND_HPP @@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_and( volatile unsigned long long int * const inline int atomic_fetch_and( volatile int * const dest , const int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} inline long int atomic_fetch_and( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} inline unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp index 8c73b4c3ef3ec55c12efe1653385ddd47f1bbbdc..ed3b438f8957fd54d21aa7e91410696f061b049e 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include <xmmintrin.h> +#endif + #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_OR_HPP ) #define KOKKOS_ATOMIC_FETCH_OR_HPP @@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_or( volatile unsigned long long int * const inline int atomic_fetch_or( volatile int * const dest , const int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} inline long int atomic_fetch_or( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} inline unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp index 504731d3a2f19529ec9be97b7709638ed38a9bb8..038cc13e9aa400d93679febb34370bb0fb729adc 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include <xmmintrin.h> +#endif + #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_SUB_HPP ) #define KOKKOS_ATOMIC_FETCH_SUB_HPP @@ -136,21 +140,41 @@ T atomic_fetch_sub( volatile T * const dest , inline int atomic_fetch_sub( volatile int * const dest , const int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} inline long int atomic_fetch_sub( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} inline unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} #endif @@ -161,6 +185,10 @@ T atomic_fetch_sub( volatile T * const dest , { union { int i ; T t ; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { @@ -178,6 +206,10 @@ T atomic_fetch_sub( volatile T * const dest , typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long) , const T >::type val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + union { long i ; T t ; } assume , oldval , newval ; oldval.t = *dest ; @@ -202,6 +234,10 @@ T atomic_fetch_sub( volatile 
T * const dest , && ( sizeof(T) != 8 ) , const T >::type& val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + while( !Impl::lock_address_host_space( (void*) dest ) ); T return_val = *dest; *dest = return_val - val; diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp index 2985fad95ebb29d434095d94cbf41fc2e7c3d3c2..e7626603fc66f89df8cc0a2d61581c5ef43deccb 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include <xmmintrin.h> +#endif + #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP ) #define KOKKOS_ATOMIC_INCREMENT_HPP @@ -52,6 +56,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment<char>(volatile char* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incb %0" : /* no output registers */ @@ -67,6 +74,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment<short>(volatile short* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incw %0" : /* no output registers */ @@ -82,6 +92,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment<int>(volatile int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! 
defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incl %0" : /* no output registers */ @@ -97,6 +110,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment<long long int>(volatile long long int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incq %0" : /* no output registers */ diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index f0ff6d78ec21134292d85c8fd230c549bb1a961c..f52cc469ace10cd795b3fa1389732e292f4357ee 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -87,17 +87,12 @@ setenv("MEMKIND_HBW_NODES", "1", 0); #if defined( KOKKOS_ENABLE_OPENMP ) if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value || std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) { - if(num_threads>0) { - if(use_numa>0) { - Kokkos::OpenMP::initialize(num_threads,use_numa); - } - else { - Kokkos::OpenMP::initialize(num_threads); - } - } else { - Kokkos::OpenMP::initialize(); + if(use_numa>0) { + Kokkos::OpenMP::initialize(num_threads,use_numa); + } + else { + Kokkos::OpenMP::initialize(num_threads); } - //std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ; } else { //std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ; @@ -437,10 +432,7 @@ void initialize(int& narg, char* arg[]) iarg++; } - InitArguments arguments; - arguments.num_threads = num_threads; - arguments.num_numa = numa; - arguments.device_id = device; + InitArguments arguments{num_threads, numa, device}; Impl::initialize_internal(arguments); } diff --git 
a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp index dc75fb072f9d1c074824331b02af6b7f0fae3706..fccd8e090fc42b66afc6d14d64abe8273f9a258c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp @@ -170,28 +170,31 @@ struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType: static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) , "Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" ); + /* this cast to bool is needed for correctness by NVCC */ + enum : bool { IsArray = static_cast<bool>(Impl::is_array< typename FunctorType::value_type >::value) }; + // If not an array then what is the sizeof(value_type) - enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) }; + enum { StaticValueSize = IsArray ? 0 : sizeof(value_type) }; typedef value_type * pointer_type ; // The reference_type for an array is 'value_type *' // The reference_type for a single value is 'value_type &' - typedef typename Impl::if_c< ! StaticValueSize , value_type * - , value_type & >::type reference_type ; + typedef typename Impl::if_c< IsArray , value_type * + , value_type & >::type reference_type ; // Number of values if single value template< class F > KOKKOS_FORCEINLINE_FUNCTION static - typename Impl::enable_if< std::is_same<F,FunctorType>::value && StaticValueSize , unsigned >::type + typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! IsArray , unsigned >::type value_count( const F & ) { return 1 ; } // Number of values if an array, protect via templating because 'f.value_count' // will only exist when the functor declares the value_type to be an array. template< class F > KOKKOS_FORCEINLINE_FUNCTION static - typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! 
StaticValueSize , unsigned >::type + typename Impl::enable_if< std::is_same<F,FunctorType>::value && IsArray , unsigned >::type value_count( const F & f ) { return f.value_count ; } // Total size of the value diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp index 8cb7430035885a0c779630c212516f4c39256de2..e11f8b6d346491f75fe0e18f0bda85385233907e 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp @@ -70,62 +70,6 @@ #ifdef KOKKOS_ENABLE_HBWSPACE #define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB) -namespace Kokkos { -namespace Experimental { -namespace { - -static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ; - -typedef int (* QuerySpaceInParallelPtr )(); - -QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ; -int s_in_parallel_query_count = 0 ; - -} // namespace <empty> - -void HBWSpace::register_in_parallel( int (*device_in_parallel)() ) -{ - if ( 0 == device_in_parallel ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel ERROR : given NULL" ) ); - } - - int i = -1 ; - - if ( ! (device_in_parallel)() ) { - for ( i = 0 ; i < s_in_parallel_query_count && ! 
(*(s_in_parallel_query[i]))() ; ++i ); - } - - if ( i < s_in_parallel_query_count ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : called in_parallel" ) ); - - } - - if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : exceeded maximum" ) ); - - } - - for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i ); - - if ( i == s_in_parallel_query_count ) { - s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ; - } -} - -int HBWSpace::in_parallel() -{ - const int n = s_in_parallel_query_count ; - - int i = 0 ; - - while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; } - - return i < n ; -} - -} // namespace Experiemtal -} // namespace Kokkos - /*--------------------------------------------------------------------------*/ namespace Kokkos { diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp index 2a5c34c375c5b0c83f9684cf14e3512a8e1864b8..a5a73ddebbc68f8e0af6382c2628f6b310045f07 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -106,62 +106,6 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace { - -static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ; - -typedef int (* QuerySpaceInParallelPtr )(); - -QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ; -int s_in_parallel_query_count = 0 ; - -} // namespace <empty> - -void HostSpace::register_in_parallel( int (*device_in_parallel)() ) -{ - if ( 0 == device_in_parallel ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) ); - } - - 
int i = -1 ; - - if ( ! (device_in_parallel)() ) { - for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i ); - } - - if ( i < s_in_parallel_query_count ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) ); - - } - - if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) ); - - } - - for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i ); - - if ( i == s_in_parallel_query_count ) { - s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ; - } -} - -int HostSpace::in_parallel() -{ - const int n = s_in_parallel_query_count ; - - int i = 0 ; - - while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; } - - return i < n ; -} - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { /* Default allocation mechanism */ @@ -340,9 +284,6 @@ void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_ } } -constexpr const char* HostSpace::name() { - return m_name; -} } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp index ac200209c72bca381f60b9564944bc444748f0fb..d2446bde09ad7697fb23d11d469fe1566dde8fac 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -45,7 +45,7 @@ #include <Kokkos_Macros.hpp> #include <impl/Kokkos_HostThreadTeam.hpp> #include <impl/Kokkos_Error.hpp> -#include <impl/Kokkos_spinwait.hpp> +#include <impl/Kokkos_Spinwait.hpp> //---------------------------------------------------------------------------- 
//---------------------------------------------------------------------------- @@ -58,9 +58,11 @@ void HostThreadTeamData::organize_pool { bool ok = true ; + memory_fence(); + // Verify not already a member of a pool: for ( int rank = 0 ; rank < size && ok ; ++rank ) { - ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch ); + ok = ( nullptr != members[rank] ) && ( 0 == members[rank]->m_pool_scratch ); } if ( ok ) { @@ -89,7 +91,6 @@ void HostThreadTeamData::organize_pool mem->m_team_alloc = 1 ; mem->m_league_rank = rank ; mem->m_league_size = size ; - mem->m_pool_rendezvous_step = 0 ; mem->m_team_rendezvous_step = 0 ; pool[ rank ] = mem ; } @@ -116,7 +117,6 @@ void HostThreadTeamData::disband_pool() m_team_alloc = 1 ; m_league_rank = 0 ; m_league_size = 1 ; - m_pool_rendezvous_step = 0 ; m_team_rendezvous_step = 0 ; } @@ -256,11 +256,6 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ; - union { - int64_t full ; - int8_t byte[8] ; - } value ; - if ( rank ) { const int group_begin = rank << shift_byte ; // == rank * size_byte @@ -275,13 +270,14 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer const int end = group_begin + size_byte < size ? size_byte : size - group_begin ; - value.full = 0 ; - for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step ); + int64_t value = 0 ; - store_fence(); // This should not be needed but fixes #742 + for ( int i = 0 ; i < end ; ++i ) { + ((int8_t*) & value )[i] = int8_t( step ); + } spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] - , value.full ); + , value ); } { @@ -316,10 +312,12 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer const int end = size_byte < size ? 
8 : size ; - value.full = 0 ; - for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step ); + int64_t value = 0 ; + for ( int i = 1 ; i < end ; ++i ) { + ((int8_t *) & value)[i] = int8_t( step ); + } - spinwait_until_equal( buffer[ sync_offset ], value.full ); + spinwait_until_equal( buffer[ sync_offset ], value ); } return rank ? 0 : 1 ; diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp index c050a16eaee005f618b252f458732dc24cad1f64..7facc0a4106737363851a11a3f2b6c7e34348b6c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -50,6 +50,7 @@ #include <Kokkos_ExecPolicy.hpp> #include <impl/Kokkos_FunctorAdapter.hpp> #include <impl/Kokkos_FunctorAnalysis.hpp> +#include <impl/Kokkos_Rendezvous.hpp> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -67,14 +68,12 @@ public: // Assume upper bounds on number of threads: // pool size <= 1024 threads - // pool rendezvous <= ( 1024 / 8 ) * 4 + 4 = 2052 // team size <= 64 threads - // team rendezvous <= ( 64 / 8 ) * 4 + 4 = 36 enum : int { max_pool_members = 1024 }; enum : int { max_team_members = 64 }; - enum : int { max_pool_rendezvous = ( max_pool_members / 8 ) * 4 + 4 }; - enum : int { max_team_rendezvous = ( max_team_members / 8 ) * 4 + 4 }; + enum : int { max_pool_rendezvous = rendezvous_buffer_size( max_pool_members ) }; + enum : int { max_team_rendezvous = rendezvous_buffer_size( max_team_members ) }; private: @@ -114,7 +113,6 @@ private: int m_league_size ; int m_work_chunk ; int m_steal_rank ; // work stealing rank - int mutable m_pool_rendezvous_step ; int mutable m_team_rendezvous_step ; HostThreadTeamData * team_member( int r ) const noexcept @@ -147,6 +145,7 @@ public: int team_rendezvous( int const root ) const noexcept { return 1 == m_team_size ? 
1 : + HostThreadTeamData:: rendezvous( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step , m_team_size @@ -157,6 +156,7 @@ public: int team_rendezvous() const noexcept { return 1 == m_team_size ? 1 : + HostThreadTeamData:: rendezvous( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step , m_team_size @@ -167,6 +167,7 @@ public: void team_rendezvous_release() const noexcept { if ( 1 < m_team_size ) { + HostThreadTeamData:: rendezvous_release( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step ); } @@ -175,19 +176,30 @@ public: inline int pool_rendezvous() const noexcept { + static constexpr int yield_wait = + #if defined( KOKKOS_COMPILER_IBM ) + // If running on IBM POWER architecture the global + // level rendzvous should immediately yield when + // waiting for other threads in the pool to arrive. + 1 + #else + 0 + #endif + ; return 1 == m_pool_size ? 1 : + Kokkos::Impl:: rendezvous( m_pool_scratch + m_pool_rendezvous - , m_pool_rendezvous_step , m_pool_size - , m_pool_rank ); + , m_pool_rank + , yield_wait ); } inline void pool_rendezvous_release() const noexcept { if ( 1 < m_pool_size ) { - rendezvous_release( m_pool_scratch + m_pool_rendezvous - , m_pool_rendezvous_step ); + Kokkos::Impl:: + rendezvous_release( m_pool_scratch + m_pool_rendezvous ); } } @@ -213,7 +225,6 @@ public: , m_league_size(1) , m_work_chunk(0) , m_steal_rank(0) - , m_pool_rendezvous_step(0) , m_team_rendezvous_step(0) {} @@ -406,7 +417,7 @@ fflush(stdout); // Steal from next team, round robin // The next team is offset by m_team_alloc if it fits in the pool. - m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ? + m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ? 
m_team_base + m_team_alloc : 0 ; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp index 98482cfab6d8a4139c9162ab48c2993021c8e141..608d514c79eb48ecb7cda69ca4b424eeebdeb0ac 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp @@ -50,51 +50,70 @@ namespace Kokkos { namespace Profiling { +static initFunction initProfileLibrary = nullptr; +static finalizeFunction finalizeProfileLibrary = nullptr; + +static beginFunction beginForCallee = nullptr; +static beginFunction beginScanCallee = nullptr; +static beginFunction beginReduceCallee = nullptr; +static endFunction endForCallee = nullptr; +static endFunction endScanCallee = nullptr; +static endFunction endReduceCallee = nullptr; + +static pushFunction pushRegionCallee = nullptr; +static popFunction popRegionCallee = nullptr; + +static allocateDataFunction allocateDataCallee = nullptr; +static deallocateDataFunction deallocateDataCallee = nullptr; + +static beginDeepCopyFunction beginDeepCopyCallee = nullptr; +static endDeepCopyFunction endDeepCopyCallee = nullptr; + SpaceHandle::SpaceHandle(const char* space_name) { strncpy(name,space_name,64); } bool profileLibraryLoaded() { - return (NULL != initProfileLibrary); + return (nullptr != initProfileLibrary); } void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { - if(NULL != beginForCallee) { + if(nullptr != beginForCallee) { Kokkos::fence(); (*beginForCallee)(kernelPrefix.c_str(), devID, kernelID); } } void endParallelFor(const uint64_t kernelID) { - if(NULL != endForCallee) { + if(nullptr != endForCallee) { Kokkos::fence(); (*endForCallee)(kernelID); } } void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { - if(NULL != beginScanCallee) { + if(nullptr != beginScanCallee) { Kokkos::fence(); 
(*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID); } } void endParallelScan(const uint64_t kernelID) { - if(NULL != endScanCallee) { + if(nullptr != endScanCallee) { Kokkos::fence(); (*endScanCallee)(kernelID); } } void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { - if(NULL != beginReduceCallee) { + if(nullptr != beginReduceCallee) { Kokkos::fence(); (*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID); } } void endParallelReduce(const uint64_t kernelID) { - if(NULL != endReduceCallee) { + if(nullptr != endReduceCallee) { Kokkos::fence(); (*endReduceCallee)(kernelID); } @@ -102,31 +121,47 @@ void endParallelReduce(const uint64_t kernelID) { void pushRegion(const std::string& kName) { - if( NULL != pushRegionCallee ) { + if( nullptr != pushRegionCallee ) { Kokkos::fence(); (*pushRegionCallee)(kName.c_str()); } } void popRegion() { - if( NULL != popRegionCallee ) { + if( nullptr != popRegionCallee ) { Kokkos::fence(); (*popRegionCallee)(); } } void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) { - if(NULL != allocateDataCallee) { + if(nullptr != allocateDataCallee) { (*allocateDataCallee)(space,label.c_str(),ptr,size); } } void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) { - if(NULL != allocateDataCallee) { + if(nullptr != deallocateDataCallee) { (*deallocateDataCallee)(space,label.c_str(),ptr,size); } } +void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr, + const SpaceHandle src_space, const std::string src_label, const void* src_ptr, + const uint64_t size) { + if(nullptr != beginDeepCopyCallee) { + (*beginDeepCopyCallee)(dst_space, dst_label.c_str(), dst_ptr, + src_space, src_label.c_str(), src_ptr, + size); + } +} + +void endDeepCopy() { + if(nullptr != endDeepCopyCallee) { + (*endDeepCopyCallee)(); + } +} + void initialize() { // Make sure 
initialize calls happens only once @@ -140,7 +175,7 @@ void initialize() { // If we do not find a profiling library in the environment then exit // early. - if( NULL == envProfileLibrary ) { + if( nullptr == envProfileLibrary ) { return ; } @@ -149,10 +184,10 @@ void initialize() { char* profileLibraryName = strtok(envProfileCopy, ";"); - if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) { + if( (nullptr != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) { firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL); - if(NULL == firstProfileLibrary) { + if(nullptr == firstProfileLibrary) { std::cerr << "Error: Unable to load KokkosP library: " << profileLibraryName << std::endl; } else { @@ -191,14 +226,19 @@ void initialize() { auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data"); deallocateDataCallee = *((deallocateDataFunction*) &p12); + auto p13 = dlsym(firstProfileLibrary, "kokkosp_begin_deep_copy"); + beginDeepCopyCallee = *((beginDeepCopyFunction*) &p13); + auto p14 = dlsym(firstProfileLibrary, "kokkosp_end_deep_copy"); + endDeepCopyCallee = *((endDeepCopyFunction*) &p14); + } } - if(NULL != initProfileLibrary) { + if(nullptr != initProfileLibrary) { (*initProfileLibrary)(0, (uint64_t) KOKKOSP_INTERFACE_VERSION, (uint32_t) 0, - NULL); + nullptr); } free(envProfileCopy); @@ -210,28 +250,30 @@ void finalize() { if(is_finalized) return; is_finalized = 1; - if(NULL != finalizeProfileLibrary) { + if(nullptr != finalizeProfileLibrary) { (*finalizeProfileLibrary)(); - // Set all profile hooks to NULL to prevent + // Set all profile hooks to nullptr to prevent // any additional calls. 
Once we are told to // finalize, we mean it - initProfileLibrary = NULL; - finalizeProfileLibrary = NULL; + initProfileLibrary = nullptr; + finalizeProfileLibrary = nullptr; - beginForCallee = NULL; - beginScanCallee = NULL; - beginReduceCallee = NULL; - endScanCallee = NULL; - endForCallee = NULL; - endReduceCallee = NULL; + beginForCallee = nullptr; + beginScanCallee = nullptr; + beginReduceCallee = nullptr; + endScanCallee = nullptr; + endForCallee = nullptr; + endReduceCallee = nullptr; - pushRegionCallee = NULL; - popRegionCallee = NULL; + pushRegionCallee = nullptr; + popRegionCallee = nullptr; - allocateDataCallee = NULL; - deallocateDataCallee = NULL; + allocateDataCallee = nullptr; + deallocateDataCallee = nullptr; + beginDeepCopyCallee = nullptr; + endDeepCopyCallee = nullptr; } } } diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp index f76e5dfa04826f057345fd919f9861c78916a7df..2c2e524d9de40925c131178ace5915b3398e1e7f 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -81,23 +81,11 @@ typedef void (*popFunction)(); typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t); typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t); - -static initFunction initProfileLibrary = NULL; -static finalizeFunction finalizeProfileLibrary = NULL; - -static beginFunction beginForCallee = NULL; -static beginFunction beginScanCallee = NULL; -static beginFunction beginReduceCallee = NULL; -static endFunction endForCallee = NULL; -static endFunction endScanCallee = NULL; -static endFunction endReduceCallee = NULL; - -static pushFunction pushRegionCallee = NULL; -static popFunction popRegionCallee = NULL; - -static allocateDataFunction allocateDataCallee = NULL; -static deallocateDataFunction deallocateDataCallee = NULL; - +typedef 
void (*beginDeepCopyFunction)( + SpaceHandle, const char*, const void*, + SpaceHandle, const char*, const void*, + uint64_t); +typedef void (*endDeepCopyFunction)(); bool profileLibraryLoaded(); @@ -114,35 +102,14 @@ void popRegion(); void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size); void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size); +void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr, + const SpaceHandle src_space, const std::string src_label, const void* src_ptr, + const uint64_t size); +void endDeepCopy(); + void initialize(); void finalize(); -//Define finalize_fake inline to get rid of warnings for unused static variables -inline void finalize_fake() { - if(NULL != finalizeProfileLibrary) { - (*finalizeProfileLibrary)(); - - // Set all profile hooks to NULL to prevent - // any additional calls. Once we are told to - // finalize, we mean it - beginForCallee = NULL; - beginScanCallee = NULL; - beginReduceCallee = NULL; - endScanCallee = NULL; - endForCallee = NULL; - endReduceCallee = NULL; - - allocateDataCallee = NULL; - deallocateDataCallee = NULL; - - initProfileLibrary = NULL; - finalizeProfileLibrary = NULL; - pushRegionCallee = NULL; - popRegionCallee = NULL; - } -} - - } } diff --git a/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ac697fce4b7b49d98aeeb42880025f780b6dc0ba --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp @@ -0,0 +1,208 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Atomic.hpp> +#include <impl/Kokkos_Rendezvous.hpp> +#include <impl/Kokkos_Spinwait.hpp> + +namespace Kokkos { namespace Impl { + +//---------------------------------------------------------------------------- +/* pattern for rendezvous + * + * if ( rendezvous() ) { + * ... 
all other threads are still in rendezvous() ... + * rendezvous_release(); + * ... all other threads are released from rendezvous() ... + * } + */ + +int rendezvous( volatile int64_t * const buffer + , int const size + , int const rank + , int const slow + ) noexcept +{ + enum : int { shift_byte = 3 }; + enum : int { size_byte = ( 01 << shift_byte ) }; // == 8 + enum : int { mask_byte = size_byte - 1 }; + + enum : int { shift_mem_cycle = 2 }; + enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4 + enum : int { mask_mem_cycle = size_mem_cycle - 1 }; + + // Cycle step values: 1 <= step <= size_val_cycle + // An odd multiple of memory cycle so that when a memory location + // is reused it has a different value. + // Must be representable within a single byte: size_val_cycle < 16 + + enum : int { size_val_cycle = 3 * size_mem_cycle }; + + // Requires: + // Called by rank = [ 0 .. size ) + // buffer aligned to int64_t[4] + + // A sequence of rendezvous uses four cycled locations in memory + // and non-equal cycled synchronization values to + // 1) prevent rendezvous from overtaking one another and + // 2) give each spin wait location an int64_t[4] span + // so that it has its own cache line. + + const int64_t step = (buffer[0] % size_val_cycle ) + 1 ; + + // The leading int64_t[4] span is for thread 0 to write + // and all other threads to read spin-wait. + // sync_offset is the index into this array for this step. + + const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle + size_mem_cycle ; + + if ( rank ) { + + const int group_begin = rank << shift_byte ; // == rank * size_byte + + if ( group_begin < size ) { + + // This thread waits for threads + // [ group_begin .. group_begin + 8 ) + // [ rank*8 .. rank*8 + 8 ) + // to write to their designated bytes. + + const int end = group_begin + size_byte < size + ?
size_byte : size - group_begin ; + + int64_t value = 0; + for ( int i = 0 ; i < end ; ++i ) { + value |= step << (i * size_byte ); + } + + store_fence(); // This should not be needed but fixes #742 + + if ( slow ) { + yield_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] + , value ); + } + else { + spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] + , value ); + } + } + + { + // This thread sets its designated byte. + // ( rank % size_byte ) + + // ( ( rank / size_byte ) * size_byte * size_mem_cycle ) + + // ( sync_offset * size_byte ) + const int offset = ( rank & mask_byte ) + + ( ( rank & ~mask_byte ) << shift_mem_cycle ) + + ( sync_offset << shift_byte ); + + // All of this thread's previous memory stores must be complete before + // this thread stores the step value at this thread's designated byte + // in the shared synchronization array. + + Kokkos::memory_fence(); + + ((volatile int8_t*) buffer)[ offset ] = int8_t( step ); + + // Memory fence to push the previous store out + Kokkos::memory_fence(); + } + + // Wait for thread 0 to release all other threads + + if ( slow ) { + yield_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) ); + } + else { + spinwait_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) ); + } + } + else { + // Thread 0 waits for threads [1..7] + // to write to their designated bytes. + + const int end = size_byte < size ? 8 : size ; + + int64_t value = 0; + for ( int i = 1 ; i < end ; ++i ) { + value |= step << (i * size_byte ); + } + + if ( slow ) { + yield_until_equal( buffer[ sync_offset ], value ); + } + else { + spinwait_until_equal( buffer[ sync_offset ], value ); + } + } + + return rank ? 
0 : 1 ; +} + +void rendezvous_release( volatile int64_t * const buffer ) noexcept +{ + enum : int { shift_mem_cycle = 2 }; + enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4 + enum : int { mask_mem_cycle = size_mem_cycle - 1 }; + enum : int { size_val_cycle = 3 * size_mem_cycle }; + + // Requires: + // Called after rendezvous() + // Called only by the thread for which rendezvous() returned non-zero + + // update step + const int64_t step = (buffer[0] % size_val_cycle ) + 1; + buffer[0] = step; + + // Memory fence to be sure all previous writes are complete: + Kokkos::memory_fence(); + + buffer[ (step & mask_mem_cycle) + size_mem_cycle ] = step; + + // Memory fence to push the store out + Kokkos::memory_fence(); +} + +}} // namespace Kokkos::Impl + diff --git a/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp new file mode 100644 index 0000000000000000000000000000000000000000..57f8633bcaca80045b5acb151abadb9aef5abc3c --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp @@ -0,0 +1,87 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3.
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_RENDEZVOUS_HPP +#define KOKKOS_IMPL_RENDEZVOUS_HPP + +#include <cstdint> + +namespace Kokkos { namespace Impl { + +inline +constexpr int rendezvous_buffer_size( int max_members ) noexcept +{ + return (((max_members + 7) / 8) * 4) + 4 + 4; +} + +/** \brief Thread pool rendezvous + * + * Rendezvous pattern: + * if ( rendezvous(root) ) { + * ... only root thread here while all others wait ... + * rendezvous_release(); + * } + * else { + * ... all other threads release here ... + * } + * + * Requires: buffer[ rendezvous_buffer_size( max_threads ) ]; + * + * When slow != 0 the expectation is thread arrival will be + * slow so the threads that arrive early should quickly yield + * their core to the runtime thus possibly allowing the late + * arriving threads to have more resources + * (e.g., power and clock frequency). 
+ */ +int rendezvous( volatile int64_t * const buffer + , int const size + , int const rank + , int const slow = 0 ) noexcept ; + +void rendezvous_release( volatile int64_t * const buffer ) noexcept ; + + +}} // namespace Kokkos::Impl + +#endif // KOKKOS_IMPL_RENDEZVOUS_HPP + diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp index 755271c07e4e473448b0800d4395582a85ca49fa..dfbeba461e34ea2878240b97c2941339eb4f5803 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp @@ -50,6 +50,7 @@ #include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_SharedAlloc.hpp> /*--------------------------------------------------------------------------*/ @@ -123,7 +124,6 @@ void serial_resize_thread_team_data( size_t pool_reduce_bytes } } -// Get thread team data structure for omp_get_thread_num() HostThreadTeamData * serial_get_thread_team_data() { return & g_serial_thread_team_data ; @@ -151,6 +151,8 @@ void Serial::initialize( unsigned threads_count (void) use_cores_per_numa; (void) allow_asynchronous_threadpool; + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + // Init the array of locks used for arbitrarily sized atomics Impl::init_lock_array_host_space(); #if defined(KOKKOS_ENABLE_PROFILING) diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp index 76297161b175c705f986e70708ec56279eb28f02..0b6fbd9af0e169520136bea28154dbe1f5166b77 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp @@ -62,7 +62,7 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute { using execution_space = Kokkos::Serial ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< 
execution_space > ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; @@ -122,7 +122,7 @@ void TaskQueueSpecialization< Kokkos::Serial > :: { using execution_space = Kokkos::Serial ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp index 2eb2b5cf529958b2fd10a8b0677e0cb40da40676..39deebbbf1d513248d3ad028ad809ee32c1fa95e 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp @@ -65,7 +65,7 @@ public: using execution_space = Kokkos::Serial ; using memory_space = Kokkos::HostSpace ; using queue_type = Kokkos::Impl::TaskQueue< execution_space > ; - using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ; + using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ; using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ; static diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..dc30ffe9e0496f6a4dceac2a93e377659d5c5cd4 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp @@ -0,0 +1,102 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP +#define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType , + Kokkos::Experimental::WorkGraphPolicy< Traits ... 
> , + Kokkos::Serial + > + : public Kokkos::Impl::Experimental:: + WorkGraphExec< FunctorType, + Kokkos::Serial, + Traits ... + > +{ +private: + + typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; + typedef Kokkos::Impl::Experimental:: + WorkGraphExec<FunctorType, Kokkos::Serial, Traits ... > Base ; + + template< class TagType > + typename std::enable_if< std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + Base::m_functor( i ); + } + + template< class TagType > + typename std::enable_if< ! std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + const TagType t{} ; + Base::m_functor( t , i ); + } + +public: + + inline + void execute() + { + for (std::int32_t i; (-1 != (i = Base::before_work())); ) { + exec_one< typename Policy::work_tag >( i ); + Base::after_work(i); + } + } + + inline + ParallelFor( const FunctorType & arg_functor + , const Policy & arg_policy ) + : Base( arg_functor, arg_policy ) + { + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp index e28c1194a7984b73da38fd7fc952e63e0c7574c9..af79523e0cbdff938983692c019613916d6c45d5 100644 --- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp @@ -46,23 +46,23 @@ namespace Kokkos { namespace Impl { -int SharedAllocationRecord< void , void >::s_tracking_enabled = 1 ; +namespace { -void SharedAllocationRecord< void , void >::tracking_claim_and_disable() -{ - // A host thread claim and disable tracking flag +__thread int t_tracking_enabled = 1; - while ( ! 
Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 1, 0 ) ); } -void SharedAllocationRecord< void , void >::tracking_release_and_enable() +int SharedAllocationRecord< void , void >::tracking_enabled() +{ return t_tracking_enabled; } + +void SharedAllocationRecord< void , void >::tracking_disable() { - // The host thread that claimed and disabled the tracking flag - // now release and enable tracking. + t_tracking_enabled = 0; +} - if ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 0, 1 ) ){ - Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord<>::tracking_release_and_enable FAILED, this host process thread did not hold the lock" ); - } +void SharedAllocationRecord< void , void >::tracking_enable() +{ + t_tracking_enabled = 1; } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp index 4dc61bb02e027d82fd94022e636778c2a17e24ed..2e3cc1a163ed1c1e60db53b827a0e0f2588816e1 100644 --- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp @@ -71,6 +71,9 @@ public: KOKKOS_INLINE_FUNCTION static const SharedAllocationHeader * get_header( void * alloc_ptr ) { return reinterpret_cast<SharedAllocationHeader*>( reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader) ); } + + KOKKOS_INLINE_FUNCTION + const char* label() const { return m_label; } }; template<> @@ -83,8 +86,6 @@ protected: typedef void (* function_type )( SharedAllocationRecord<void,void> * ); - static int s_tracking_enabled ; - SharedAllocationHeader * const m_alloc_ptr ; size_t const m_alloc_size ; function_type const m_dealloc ; @@ -110,17 +111,17 @@ protected: public: inline std::string get_label() const { return std::string("Unmanaged"); } - static int tracking_enabled() { return s_tracking_enabled ; } + static int tracking_enabled(); /**\brief A host process 
thread claims and disables the * shared allocation tracking flag. */ - static void tracking_claim_and_disable(); + static void tracking_disable(); /**\brief A host process thread releases and enables the * shared allocation tracking flag. */ - static void tracking_release_and_enable(); + static void tracking_enable(); ~SharedAllocationRecord() = default ; @@ -317,6 +318,11 @@ public: #endif } + KOKKOS_INLINE_FUNCTION + bool has_record() const { + return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0; + } + KOKKOS_FORCEINLINE_FUNCTION ~SharedAllocationTracker() { KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT } diff --git a/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3d3f83ed857a958cc3519008ced28fe6bd0198bf --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp @@ -0,0 +1,210 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + +#include <Kokkos_Atomic.hpp> +#include <impl/Kokkos_Spinwait.hpp> +#include <impl/Kokkos_BitOps.hpp> + +#if defined( KOKKOS_ENABLE_STDTHREAD ) + #include <thread> +#elif !defined( _WIN32 ) + #include <sched.h> + #include <time.h> +#else + #include <process.h> + #include <winsock2.h> + #include <windows.h> +#endif + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +namespace { + +void host_thread_yield( const uint32_t i , const int force_yield ) +{ + static constexpr uint32_t sleep_limit = 1 << 13 ; + static constexpr uint32_t yield_limit = 1 << 12 ; + + const int c = Kokkos::Impl::bit_scan_reverse(i); + + if ( sleep_limit < i ) { + + // Attempt to put the thread to sleep for 'c' microseconds ('c' milliseconds on Windows) + + #if defined( KOKKOS_ENABLE_STDTHREAD ) + std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) ); + #elif !defined( _WIN32 ) + timespec req ; + req.tv_sec = 0 ; +
req.tv_nsec = 1000 * c ; + nanosleep( &req, nullptr ); + #else /* defined( _WIN32 ) IS Microsoft Windows */ + Sleep(c); + #endif + } + + else if ( force_yield || yield_limit < i ) { + + // Attempt to yield thread resources to runtime + + #if defined( KOKKOS_ENABLE_STDTHREAD ) + std::this_thread::yield(); + #elif !defined( _WIN32 ) + sched_yield(); + #else /* defined( _WIN32 ) IS Microsoft Windows */ + YieldProcessor(); + #endif + } + + #if defined( KOKKOS_ENABLE_ASM ) + + else if ( (1u<<4) < i ) { + + // Insert a few no-ops to quiet the thread: + + for ( int k = 0 ; k < c ; ++k ) { + #if defined( __amd64 ) || defined( __amd64__ ) || \ + defined( __x86_64 ) || defined( __x86_64__ ) + #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */ + asm volatile( "nop\n" ); + #else + __asm__ __volatile__( "nop\n" ); + #endif + #elif defined(__PPC64__) + asm volatile( "nop\n" ); + #endif + } + } + + { + // Insert memory pause + #if defined( __amd64 ) || defined( __amd64__ ) || \ + defined( __x86_64 ) || defined( __x86_64__ ) + #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */ + asm volatile( "pause\n":::"memory" ); + #else + __asm__ __volatile__( "pause\n":::"memory" ); + #endif + #elif defined(__PPC64__) + asm volatile( "or 27, 27, 27" ::: "memory" ); + #endif + } + + #endif /* defined( KOKKOS_ENABLE_ASM ) */ +} + +}}} // namespace Kokkos::Impl::{anonymous} + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +void spinwait_while_equal( volatile int32_t & flag , const int32_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0); + Kokkos::load_fence(); +} + +void spinwait_until_equal( volatile int32_t & flag , const int32_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0); + Kokkos::load_fence(); +} + +void spinwait_while_equal( volatile int64_t & flag , const int64_t value ) +{ + 
Kokkos::store_fence(); + uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0); + Kokkos::load_fence(); +} + +void spinwait_until_equal( volatile int64_t & flag , const int64_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0); + Kokkos::load_fence(); +} + +void yield_while_equal( volatile int32_t & flag , const int32_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1); + Kokkos::load_fence(); +} + +void yield_until_equal( volatile int32_t & flag , const int32_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1); + Kokkos::load_fence(); +} + +void yield_while_equal( volatile int64_t & flag , const int64_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1); + Kokkos::load_fence(); +} + +void yield_until_equal( volatile int64_t & flag , const int64_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1); + Kokkos::load_fence(); +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#else +void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {} +#endif + diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp similarity index 82% rename from lib/kokkos/core/src/impl/Kokkos_spinwait.hpp rename to lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp index 6e34b8a943d164eea1af317be66928a26a9e4ab2..b49e308566ff14fe4daf78a7271415be122ac7d7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp @@ -59,6 +59,13 @@ void spinwait_until_equal( volatile int32_t & flag , const int32_t value ); void spinwait_while_equal( volatile int64_t & flag , const int64_t value ); void spinwait_until_equal( volatile int64_t & flag , const int64_t value ); + +void yield_while_equal( volatile int32_t & flag , const int32_t value 
); +void yield_until_equal( volatile int32_t & flag , const int32_t value ); + +void yield_while_equal( volatile int64_t & flag , const int64_t value ); +void yield_until_equal( volatile int64_t & flag , const int64_t value ); + #else KOKKOS_INLINE_FUNCTION @@ -71,6 +78,16 @@ void spinwait_while_equal( volatile int64_t & , const int64_t ) {} KOKKOS_INLINE_FUNCTION void spinwait_until_equal( volatile int64_t & , const int64_t ) {} +KOKKOS_INLINE_FUNCTION +void yield_while_equal( volatile int32_t & , const int32_t ) {} +KOKKOS_INLINE_FUNCTION +void yield_until_equal( volatile int32_t & , const int32_t ) {} + +KOKKOS_INLINE_FUNCTION +void yield_while_equal( volatile int64_t & , const int64_t ) {} +KOKKOS_INLINE_FUNCTION +void yield_until_equal( volatile int64_t & , const int64_t ) {} + #endif } /* namespace Impl */ diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp index bee98e6745e0be66c3afe6b7d308abd08b42bef9..5f8699302d7d040266c7c4596ccb773305e26f49 100644 --- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp @@ -59,23 +59,223 @@ namespace Kokkos { namespace Impl { -/*\brief Implementation data for task data management, access, and execution. +template< class Space , typename ResultType , class FunctorType > +class TaskBase ; + +template< typename Space > +class TaskQueue ; + +template< typename Space > +class TaskQueueSpecialization ; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Base class for task management, access, and execution. * - * Curiously recurring template pattern (CRTP) - * to allow static_cast from the - * task root type and a task's FunctorType. 
+ * Inheritance structure to allow static_cast from the task root type + * and a task's FunctorType. * + * // Enable a functor to access the base class + * // and provide memory for result value. * TaskBase< Space , ResultType , FunctorType > - * : TaskBase< Space , ResultType , void > + * : TaskBase< void , void , void > * , FunctorType * { ... }; + * Followed by memory allocated for result value. + * + * + * States of a task: + * + * Constructing State, NOT IN a linked list + * m_wait == 0 + * m_next == 0 + * + * Scheduling transition : Constructing -> Waiting + * before: + * m_wait == 0 + * m_next == this task's initial dependence, 0 if none + * after: + * m_wait == EndTag + * m_next == EndTag + * + * Waiting State, IN a linked list + * m_apply != 0 + * m_queue != 0 + * m_ref_count > 0 + * m_wait == head of linked list of tasks waiting on this task + * m_next == next of linked list of tasks + * + * transition : Waiting -> Executing + * before: + * m_next == EndTag + * after:: + * m_next == LockTag + * + * Executing State, NOT IN a linked list + * m_apply != 0 + * m_queue != 0 + * m_ref_count > 0 + * m_wait == head of linked list of tasks waiting on this task + * m_next == LockTag + * + * Respawn transition : Executing -> Executing-Respawn + * before: + * m_next == LockTag + * after: + * m_next == this task's updated dependence, 0 if none + * + * Executing-Respawn State, NOT IN a linked list + * m_apply != 0 + * m_queue != 0 + * m_ref_count > 0 + * m_wait == head of linked list of tasks waiting on this task + * m_next == this task's updated dependence, 0 if none + * + * transition : Executing -> Complete + * before: + * m_wait == head of linked list + * after: + * m_wait == LockTag + * + * Complete State, NOT IN a linked list + * m_wait == LockTag: cannot add dependence (<=> complete) + * m_next == LockTag: not a member of a wait queue * - * TaskBase< Space , ResultType , void > - * : TaskBase< Space , void , void > - * { ... 
}; */ -template< typename Space , typename ResultType , typename FunctorType > -class TaskBase ; +template<> +class TaskBase< void , void , void > +{ +public: + + enum : int16_t { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 }; + enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) }; + + template< typename > friend class Kokkos::TaskScheduler ; + + typedef TaskQueue< void > queue_type ; + + typedef void (* function_type) ( TaskBase * , void * ); + + // sizeof(TaskBase) == 48 + + function_type m_apply ; ///< Apply function pointer + queue_type * m_queue ; ///< Pointer to queue + TaskBase * m_wait ; ///< Linked list of tasks waiting on this + TaskBase * m_next ; ///< Waiting linked-list next + int32_t m_ref_count ; ///< Reference count + int32_t m_alloc_size ; ///< Allocation size + int32_t m_dep_count ; ///< Aggregate's number of dependences + int16_t m_task_type ; ///< Type of task + int16_t m_priority ; ///< Priority of runnable task + + TaskBase( TaskBase && ) = delete ; + TaskBase( const TaskBase & ) = delete ; + TaskBase & operator = ( TaskBase && ) = delete ; + TaskBase & operator = ( const TaskBase & ) = delete ; + + KOKKOS_INLINE_FUNCTION ~TaskBase() = default ; + + KOKKOS_INLINE_FUNCTION constexpr + TaskBase() + : m_apply( 0 ) + , m_queue( 0 ) + , m_wait( 0 ) + , m_next( 0 ) + , m_ref_count( 0 ) + , m_alloc_size( 0 ) + , m_dep_count( 0 ) + , m_task_type( 0 ) + , m_priority( 0 ) + {} + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + TaskBase * volatile * aggregate_dependences() volatile + { return reinterpret_cast<TaskBase*volatile*>( this + 1 ); } + + KOKKOS_INLINE_FUNCTION + bool requested_respawn() + { + // This should only be called when a task has finished executing and is + // in the transition to either the complete or executing-respawn state. 
+ TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag ); + return lock != m_next; + } + + KOKKOS_INLINE_FUNCTION + void add_dependence( TaskBase* dep ) + { + // Precondition: lock == m_next + + TaskBase * const lock = (TaskBase *) LockTag ; + + // Assign dependence to m_next. It will be processed in the subsequent + // call to schedule. Error if the dependence is reset. + if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) { + Kokkos::abort("TaskScheduler ERROR: resetting task dependence"); + } + + if ( 0 != dep ) { + // The future may be destroyed upon returning from this call + // so increment reference count to track this assignment. + Kokkos::atomic_increment( &(dep->m_ref_count) ); + } + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + int32_t reference_count() const + { return *((int32_t volatile *)( & m_ref_count )); } + +}; + +static_assert( sizeof(TaskBase<void,void,void>) == 48 + , "Verifying expected sizeof(TaskBase<void,void,void>)" ); + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template< typename ResultType > +struct TaskResult { + + enum : int32_t { size = sizeof(ResultType) }; + + using reference_type = ResultType & ; + + KOKKOS_INLINE_FUNCTION static + ResultType * ptr( TaskBase<void,void,void> * task ) + { + return reinterpret_cast< ResultType * > + ( reinterpret_cast< char * >(task) + task->m_alloc_size - sizeof(ResultType) ); + } + + KOKKOS_INLINE_FUNCTION static + reference_type get( TaskBase<void,void,void> * task ) + { return *ptr( task ); } +}; + +template<> +struct TaskResult< void > { + + enum : int32_t { size = 0 }; + + using reference_type = void ; + + KOKKOS_INLINE_FUNCTION static + void * ptr( TaskBase<void,void,void> * ) { return (void*) 0 ; } + + KOKKOS_INLINE_FUNCTION static + reference_type get( TaskBase<void,void,void> * ) {} +}; } /* namespace Impl */ } /* namespace 
Kokkos */ @@ -86,8 +286,8 @@ class TaskBase ; namespace Kokkos { namespace Impl { -template< typename Space > -class TaskQueueSpecialization ; +template<> +class TaskQueue< void > {}; /** \brief Manage task allocation, deallocation, and scheduling. * @@ -95,7 +295,7 @@ class TaskQueueSpecialization ; * All other aspects of task management have shared implementation. */ template< typename ExecSpace > -class TaskQueue { +class TaskQueue : public TaskQueue<void> { private: friend class TaskQueueSpecialization< ExecSpace > ; @@ -106,7 +306,7 @@ private: using memory_space = typename specialization::memory_space ; using device_type = Kokkos::Device< execution_space , memory_space > ; using memory_pool = Kokkos::MemoryPool< device_type > ; - using task_root_type = Kokkos::Impl::TaskBase<execution_space,void,void> ; + using task_root_type = Kokkos::Impl::TaskBase<void,void,void> ; struct Destroy { TaskQueue * m_queue ; @@ -198,12 +398,10 @@ public: } // Assign task pointer with reference counting of assigned tasks - template< typename LV , typename RV > KOKKOS_FUNCTION static - void assign( TaskBase< execution_space,LV,void> ** const lhs - , TaskBase< execution_space,RV,void> * const rhs ) + void assign( task_root_type ** const lhs + , task_root_type * const rhs ) { - using task_lhs = TaskBase< execution_space,LV,void> ; #if 0 { printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n" @@ -225,7 +423,7 @@ public: // Force write of *lhs - *static_cast< task_lhs * volatile * >(lhs) = rhs ; + *static_cast< task_root_type * volatile * >(lhs) = rhs ; Kokkos::memory_fence(); } @@ -238,272 +436,52 @@ public: KOKKOS_FUNCTION void deallocate( void * p , size_t n ); ///< Deallocate to the memory pool -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { -template<> -class 
TaskBase< void , void , void > { -public: - enum : int16_t { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 }; - enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) }; -}; - -/** \brief Base class for task management, access, and execution. - * - * Inheritance structure to allow static_cast from the task root type - * and a task's FunctorType. - * - * // Enable a Future to access result data - * TaskBase< Space , ResultType , void > - * : TaskBase< void , void , void > - * { ... }; - * - * // Enable a functor to access the base class - * TaskBase< Space , ResultType , FunctorType > - * : TaskBase< Space , ResultType , void > - * , FunctorType - * { ... }; - * - * - * States of a task: - * - * Constructing State, NOT IN a linked list - * m_wait == 0 - * m_next == 0 - * - * Scheduling transition : Constructing -> Waiting - * before: - * m_wait == 0 - * m_next == this task's initial dependence, 0 if none - * after: - * m_wait == EndTag - * m_next == EndTag - * - * Waiting State, IN a linked list - * m_apply != 0 - * m_queue != 0 - * m_ref_count > 0 - * m_wait == head of linked list of tasks waiting on this task - * m_next == next of linked list of tasks - * - * transition : Waiting -> Executing - * before: - * m_next == EndTag - * after:: - * m_next == LockTag - * - * Executing State, NOT IN a linked list - * m_apply != 0 - * m_queue != 0 - * m_ref_count > 0 - * m_wait == head of linked list of tasks waiting on this task - * m_next == LockTag - * - * Respawn transition : Executing -> Executing-Respawn - * before: - * m_next == LockTag - * after: - * m_next == this task's updated dependence, 0 if none - * - * Executing-Respawn State, NOT IN a linked list - * m_apply != 0 - * m_queue != 0 - * m_ref_count > 0 - * m_wait == head of linked list of tasks waiting on this task - * m_next == this task's updated dependence, 0 if none - * - * transition : Executing -> Complete - * before: - * m_wait == head of linked list - * after: - * m_wait == LockTag - * - * 
Complete State, NOT IN a linked list - * m_wait == LockTag: cannot add dependence - * m_next == LockTag: not a member of a wait queue - * - */ -template< typename ExecSpace > -class TaskBase< ExecSpace , void , void > -{ -public: - - enum : int16_t { TaskTeam = TaskBase<void,void,void>::TaskTeam - , TaskSingle = TaskBase<void,void,void>::TaskSingle - , Aggregate = TaskBase<void,void,void>::Aggregate }; - - enum : uintptr_t { LockTag = TaskBase<void,void,void>::LockTag - , EndTag = TaskBase<void,void,void>::EndTag }; - - using execution_space = ExecSpace ; - using queue_type = TaskQueue< execution_space > ; - - template< typename > friend class Kokkos::TaskScheduler ; - - typedef void (* function_type) ( TaskBase * , void * ); - - // sizeof(TaskBase) == 48 - - function_type m_apply ; ///< Apply function pointer - queue_type * m_queue ; ///< Queue in which this task resides - TaskBase * m_wait ; ///< Linked list of tasks waiting on this - TaskBase * m_next ; ///< Waiting linked-list next - int32_t m_ref_count ; ///< Reference count - int32_t m_alloc_size ; ///< Allocation size - int32_t m_dep_count ; ///< Aggregate's number of dependences - int16_t m_task_type ; ///< Type of task - int16_t m_priority ; ///< Priority of runnable task - - TaskBase() = delete ; - TaskBase( TaskBase && ) = delete ; - TaskBase( const TaskBase & ) = delete ; - TaskBase & operator = ( TaskBase && ) = delete ; - TaskBase & operator = ( const TaskBase & ) = delete ; - - KOKKOS_INLINE_FUNCTION ~TaskBase() = default ; - - // Constructor for a runnable task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( function_type arg_apply - , queue_type * arg_queue - , TaskBase * arg_dependence - , int arg_ref_count - , int arg_alloc_size - , int arg_task_type - , int arg_priority - ) noexcept - : m_apply( arg_apply ) - , m_queue( arg_queue ) - , m_wait( 0 ) - , m_next( arg_dependence ) - , m_ref_count( arg_ref_count ) - , m_alloc_size( arg_alloc_size ) - , m_dep_count( 0 ) - , m_task_type( arg_task_type ) 
- , m_priority( arg_priority ) - {} - - // Constructor for an aggregate task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( queue_type * arg_queue - , int arg_ref_count - , int arg_alloc_size - , int arg_dep_count - ) noexcept - : m_apply( 0 ) - , m_queue( arg_queue ) - , m_wait( 0 ) - , m_next( 0 ) - , m_ref_count( arg_ref_count ) - , m_alloc_size( arg_alloc_size ) - , m_dep_count( arg_dep_count ) - , m_task_type( Aggregate ) - , m_priority( 0 ) - {} //---------------------------------------- + /**\brief Allocation size for a spawned task: sizeof the task plus the TaskResult size for its value_type, each rounded up to a multiple of 16 bytes, passed through the memory pool's allocate_block_size */ - KOKKOS_INLINE_FUNCTION - TaskBase ** aggregate_dependences() - { return reinterpret_cast<TaskBase**>( this + 1 ); } - - KOKKOS_INLINE_FUNCTION - bool requested_respawn() + template< typename FunctorType > + KOKKOS_FUNCTION + size_t spawn_allocation_size() const { - // This should only be called when a task has finished executing and is - // in the transition to either the complete or executing-respawn state. - TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag ); - return lock != m_next; - } + using value_type = typename FunctorType::value_type ; - KOKKOS_INLINE_FUNCTION - void add_dependence( TaskBase* dep ) - { - // Precondition: lock == m_next + using task_type = Impl::TaskBase< execution_space + , value_type + , FunctorType > ; - TaskBase * const lock = (TaskBase *) LockTag ; + enum : size_t { align = ( 1 << 4 ) , align_mask = align - 1 }; + enum : size_t { task_size = sizeof(task_type) }; + enum : size_t { result_size = Impl::TaskResult< value_type >::size }; + enum : size_t { alloc_size = + ( ( task_size + align_mask ) & ~align_mask ) + + ( ( result_size + align_mask ) & ~align_mask ) }; - // Assign dependence to m_next. It will be processed in the subsequent - // call to schedule. Error if the dependence is reset.
- if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) { - Kokkos::abort("TaskScheduler ERROR: resetting task dependence"); - } - - if ( 0 != dep ) { - // The future may be destroyed upon returning from this call - // so increment reference count to track this assignment. - Kokkos::atomic_increment( &(dep->m_ref_count) ); - } + return m_memory.allocate_block_size( alloc_size ); } - using get_return_type = void ; + /**\brief Allocation size for a when_all aggregate */ - KOKKOS_INLINE_FUNCTION - get_return_type get() const {} + KOKKOS_FUNCTION + size_t when_all_allocation_size( int narg ) const + { + return m_memory.allocate_block_size( sizeof(task_root_type) + narg * sizeof(task_root_type*) ); + } }; -template < typename ExecSpace , typename ResultType > -class TaskBase< ExecSpace , ResultType , void > - : public TaskBase< ExecSpace , void , void > -{ -private: - - using root_type = TaskBase<ExecSpace,void,void> ; - using function_type = typename root_type::function_type ; - using queue_type = typename root_type::queue_type ; - - static_assert( sizeof(root_type) == 48 , "" ); - - TaskBase() = delete ; - TaskBase( TaskBase && ) = delete ; - TaskBase( const TaskBase & ) = delete ; - TaskBase & operator = ( TaskBase && ) = delete ; - TaskBase & operator = ( const TaskBase & ) = delete ; - -public: - - ResultType m_result ; - - KOKKOS_INLINE_FUNCTION ~TaskBase() = default ; - - // Constructor for runnable task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( function_type arg_apply - , queue_type * arg_queue - , root_type * arg_dependence - , int arg_ref_count - , int arg_alloc_size - , int arg_task_type - , int arg_priority - ) - : root_type( arg_apply - , arg_queue - , arg_dependence - , arg_ref_count - , arg_alloc_size - , arg_task_type - , arg_priority - ) - , m_result() - {} +} /* namespace Impl */ +} /* namespace Kokkos */ - using get_return_type = ResultType const & ; +//----------------------------------------------------------------------------
+//---------------------------------------------------------------------------- - KOKKOS_INLINE_FUNCTION - get_return_type get() const { return m_result ; } -}; +namespace Kokkos { +namespace Impl { -template< typename ExecSpace , typename ResultType , typename FunctorType > +template< class ExecSpace , typename ResultType , class FunctorType > class TaskBase - : public TaskBase< ExecSpace , ResultType , void > + : public TaskBase< void , void , void > , public FunctorType { private: @@ -516,50 +494,31 @@ private: public: - using root_type = TaskBase< ExecSpace , void , void > ; - using base_type = TaskBase< ExecSpace , ResultType , void > ; - using specialization = TaskQueueSpecialization< ExecSpace > ; - using function_type = typename root_type::function_type ; - using queue_type = typename root_type::queue_type ; - using member_type = typename specialization::member_type ; + using root_type = TaskBase< void , void , void > ; using functor_type = FunctorType ; using result_type = ResultType ; - template< typename Type > - KOKKOS_INLINE_FUNCTION static - void apply_functor - ( Type * const task - , typename std::enable_if - < std::is_same< typename Type::result_type , void >::value - , member_type * const - >::type member - ) - { - using fType = typename Type::functor_type ; - static_cast<fType*>(task)->operator()( *member ); - } + using specialization = TaskQueueSpecialization< ExecSpace > ; + using member_type = typename specialization::member_type ; - template< typename Type > - KOKKOS_INLINE_FUNCTION static - void apply_functor - ( Type * const task - , typename std::enable_if - < ! 
std::is_same< typename Type::result_type , void >::value - , member_type * const - >::type member - ) - { - using fType = typename Type::functor_type ; - static_cast<fType*>(task)->operator()( *member , task->m_result ); - } + KOKKOS_INLINE_FUNCTION + void apply_functor( member_type * const member , void * ) + { functor_type::operator()( *member ); } + + template< typename T > + KOKKOS_INLINE_FUNCTION + void apply_functor( member_type * const member + , T * const result ) + { functor_type::operator()( *member , *result ); } KOKKOS_FUNCTION static void apply( root_type * root , void * exec ) { TaskBase * const task = static_cast< TaskBase * >( root ); member_type * const member = reinterpret_cast< member_type * >( exec ); + result_type * const result = TaskResult< result_type >::ptr( task ); - TaskBase::template apply_functor( task , member ); + task->apply_functor( member , result ); // Task may be serial or team. // If team then must synchronize before querying if respawn was requested. 
@@ -576,26 +535,9 @@ public: } // Constructor for runnable task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( function_type arg_apply - , queue_type * arg_queue - , root_type * arg_dependence - , int arg_ref_count - , int arg_alloc_size - , int arg_task_type - , int arg_priority - , FunctorType && arg_functor - ) - : base_type( arg_apply - , arg_queue - , arg_dependence - , arg_ref_count - , arg_alloc_size - , arg_task_type - , arg_priority - ) - , functor_type( arg_functor ) - {} + KOKKOS_INLINE_FUNCTION constexpr + TaskBase( FunctorType && arg_functor ) + : root_type() , functor_type( std::move(arg_functor) ) {} KOKKOS_INLINE_FUNCTION ~TaskBase() {} diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp index aee381afad69cfbdf9e2590b601bc188484d2215..1974f7e1cae62eca6a1f692476786623f4993800 100644 --- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp @@ -44,6 +44,8 @@ #include <Kokkos_Macros.hpp> #if defined( KOKKOS_ENABLE_TASKDAG ) +#define KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING 0 + namespace Kokkos { namespace Impl { @@ -100,9 +102,11 @@ KOKKOS_FUNCTION void TaskQueue< ExecSpace >::decrement ( TaskQueue< ExecSpace >::task_root_type * task ) { - const int count = Kokkos::atomic_fetch_add(&(task->m_ref_count),-1); + task_root_type volatile & t = *task ; + + const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count),-1); -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING if ( 1 == count ) { printf( "decrement-destroy( 0x%lx { 0x%lx %d %d } )\n" , uintptr_t( task ) @@ -114,9 +118,13 @@ void TaskQueue< ExecSpace >::decrement #endif if ( ( 1 == count ) && - ( task->m_next == (task_root_type *) task_root_type::LockTag ) ) { + ( t.m_next == (task_root_type *) task_root_type::LockTag ) ) { // Reference count is zero and task is complete, deallocate. 
- task->m_queue->deallocate( task , task->m_alloc_size ); + + TaskQueue< ExecSpace > * const queue = + static_cast< TaskQueue< ExecSpace > * >( t.m_queue ); + + queue->deallocate( task , t.m_alloc_size ); } else if ( count <= 1 ) { Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" ); @@ -171,7 +179,7 @@ bool TaskQueue< ExecSpace >::push_task // Fail the push attempt if the queue is locked; // otherwise retry until the push succeeds. -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n" , uintptr_t(queue) , uintptr_t(*queue) @@ -186,9 +194,9 @@ bool TaskQueue< ExecSpace >::push_task task_root_type * const zero = (task_root_type *) 0 ; task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; - task_root_type * volatile * const next = & task->m_next ; + task_root_type * volatile & next = task->m_next ; - if ( zero != *next ) { + if ( zero != next ) { Kokkos::abort("TaskQueue::push_task ERROR: already a member of another queue" ); } @@ -196,9 +204,9 @@ bool TaskQueue< ExecSpace >::push_task while ( lock != y ) { - *next = y ; + next = y ; - // Do not proceed until '*next' has been stored. + // Do not proceed until 'next' has been stored. Kokkos::memory_fence(); task_root_type * const x = y ; @@ -211,9 +219,9 @@ bool TaskQueue< ExecSpace >::push_task // Failed, replace 'task->m_next' value since 'task' remains // not a member of a queue. - *next = zero ; + next = zero ; - // Do not proceed until '*next' has been stored. + // Do not proceed until 'next' has been stored. Kokkos::memory_fence(); return false ; @@ -270,11 +278,13 @@ TaskQueue< ExecSpace >::pop_ready_task // This thread has exclusive access to // the queue and the popped task's m_next. 
- *queue = task->m_next ; task->m_next = lock ; + task_root_type * volatile & next = task->m_next ; + + *queue = next ; next = lock ; Kokkos::memory_fence(); -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n" , uintptr_t(queue) , uintptr_t(task) @@ -323,7 +333,7 @@ void TaskQueue< ExecSpace >::schedule_runnable // task->m_wait == head of linked list (queue) // task->m_next == member of linked list (queue) -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n" , uintptr_t(task) , uintptr_t(task->m_wait) @@ -337,20 +347,22 @@ void TaskQueue< ExecSpace >::schedule_runnable task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + task_root_type volatile & t = *task ; + bool respawn = false ; //---------------------------------------- - if ( zero == task->m_wait ) { + if ( zero == t.m_wait ) { // Task in Constructing state // - Transition to Waiting state // Preconditions: // - call occurs exclusively within a single thread - task->m_wait = end ; + t.m_wait = end ; // Task in Waiting state } - else if ( lock != task->m_wait ) { + else if ( lock != t.m_wait ) { // Task in Executing state with Respawn request // - Update dependence // - Transition to Waiting state @@ -373,7 +385,9 @@ void TaskQueue< ExecSpace >::schedule_runnable // Exclusive access so don't need an atomic exchange // task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero ); - task_root_type * dep = task->m_next ; task->m_next = zero ; + task_root_type * dep = t.m_next ; t.m_next = zero ; + + Kokkos::memory_fence(); const bool is_ready = ( 0 == dep ) || ( ! 
push_task( & dep->m_wait , task ) ); @@ -398,7 +412,7 @@ void TaskQueue< ExecSpace >::schedule_runnable Kokkos::atomic_increment( & m_ready_count ); task_root_type * volatile * const ready_queue = - & m_ready[ task->m_priority ][ task->m_task_type ]; + & m_ready[ t.m_priority ][ t.m_task_type ]; // A push_task fails if the ready queue is locked. // A ready queue is only locked during a push or pop; @@ -441,7 +455,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate // task->m_wait == head of linked list (queue) // task->m_next == member of linked list (queue) -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n" , uintptr_t(task) , uintptr_t(task->m_wait) @@ -455,18 +469,20 @@ void TaskQueue< ExecSpace >::schedule_aggregate task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + task_root_type volatile & t = *task ; + //---------------------------------------- - if ( zero == task->m_wait ) { + if ( zero == t.m_wait ) { // Task in Constructing state // - Transition to Waiting state // Preconditions: // - call occurs exclusively within a single thread - task->m_wait = end ; + t.m_wait = end ; // Task in Waiting state } - else if ( lock == task->m_wait ) { + else if ( lock == t.m_wait ) { // Task in Complete state Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete"); } @@ -477,14 +493,14 @@ void TaskQueue< ExecSpace >::schedule_aggregate // (1) created or // (2) being removed from a completed task's wait list. - task_root_type ** const aggr = task->aggregate_dependences(); + task_root_type * volatile * const aggr = t.aggregate_dependences(); // Assume the 'when_all' is complete until a dependence is // found that is not complete. 
bool is_complete = true ; - for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) { + for ( int i = t.m_dep_count ; 0 < i && is_complete ; ) { --i ; @@ -523,7 +539,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate // Complete the when_all 'task' to schedule other tasks // that are waiting for the when_all 'task' to complete. - task->m_next = lock ; + t.m_next = lock ; complete( task ); @@ -573,7 +589,7 @@ void TaskQueue< ExecSpace >::complete task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n" , uintptr_t(task) , uintptr_t(task->m_wait) @@ -584,11 +600,13 @@ void TaskQueue< ExecSpace >::complete fflush( stdout ); #endif - const bool runnable = task_root_type::Aggregate != task->m_task_type ; + task_root_type volatile & t = *task ; + + const bool runnable = task_root_type::Aggregate != t.m_task_type ; //---------------------------------------- - if ( runnable && lock != task->m_next ) { + if ( runnable && lock != t.m_next ) { // Is a runnable task has finished executing and requested respawn. // Schedule the task for subsequent execution. @@ -607,7 +625,7 @@ void TaskQueue< ExecSpace >::complete // Stop other tasks from adding themselves to this task's wait queue // by locking the head of this task's wait queue. 
- task_root_type * x = Kokkos::atomic_exchange( & task->m_wait , lock ); + task_root_type * x = Kokkos::atomic_exchange( & t.m_wait , lock ); if ( x != (task_root_type *) lock ) { @@ -627,9 +645,13 @@ void TaskQueue< ExecSpace >::complete // Have exclusive access to 'x' until it is scheduled // Set x->m_next = zero <= no dependence, not a respawn - task_root_type * const next = x->m_next ; x->m_next = 0 ; + task_root_type volatile & vx = *x ; + + task_root_type * const next = vx.m_next ; vx.m_next = 0 ; + + Kokkos::memory_fence(); - if ( task_root_type::Aggregate != x->m_task_type ) { + if ( task_root_type::Aggregate != vx.m_task_type ) { schedule_runnable( x ); } else { diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp index c55636b64ea8331ae2a2d66fc2479b727cbf5115..ed1a71bea7ab49ef8813beba5fb7e6a88b6588fe 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -47,7 +47,6 @@ #include <Kokkos_Array.hpp> namespace Kokkos { -namespace Experimental { namespace Impl { template< class DataType , class ArrayLayout , class V , size_t N , class P > @@ -94,13 +93,12 @@ public: typedef typename ViewDataType< non_const_scalar_type , array_scalar_dimension >::type non_const_scalar_array_type ; }; -}}} // namespace Kokkos::Experimental::Impl +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { /** \brief View mapping for non-specialized data type and standard layout */ @@ -597,7 +595,7 @@ public: } }; -}}} // namespace Kokkos::Experimental::Impl +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp index 6381aee468c9ee114c5c050e20565c2a8e52b127..f32c6bb2eedca055d81db61c74ab47e9e23e34f7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp @@ -96,6 +96,27 @@ struct is_view_label< const char[N] > : public std::true_type {}; template< typename ... 
P > struct ViewCtorProp ; +// Forward declare +template< typename Specialize , typename T > +struct CommonViewAllocProp ; + +/* Common value_type stored as ViewCtorProp + */ +template< typename Specialize , typename T > +struct ViewCtorProp< void , CommonViewAllocProp<Specialize,T> > +{ + ViewCtorProp() = default ; + ViewCtorProp( const ViewCtorProp & ) = default ; + ViewCtorProp & operator = ( const ViewCtorProp & ) = default ; + + using type = CommonViewAllocProp<Specialize,T> ; + + ViewCtorProp( const type & arg ) : value( arg ) {} + ViewCtorProp( type && arg ) : value( arg ) {} + + type value ; +}; + /* std::integral_constant<unsigned,I> are dummy arguments * that avoid duplicate base class errors */ diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp index 900bd88f1c164891e8f14310da62fb6bbe9ef683..d346f9e6393530f4489d027db2c4d7f9892b15d1 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -62,7 +62,6 @@ //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { template< unsigned I , size_t ... Args > @@ -250,7 +249,7 @@ struct ViewDimensionAssignable< ViewDimension< DstArgs ... > }; -}}} // namespace Kokkos::Experimental::Impl +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -266,14 +265,11 @@ struct ALL_t { }} // namespace Kokkos::Impl namespace Kokkos { -namespace Experimental { namespace Impl { -using Kokkos::Impl::ALL_t ; - template< class T > struct is_integral_extent_type -{ enum { value = std::is_same<T,Kokkos::Experimental::Impl::ALL_t>::value ? 1 : 0 }; }; +{ enum { value = std::is_same<T,Kokkos::Impl::ALL_t>::value ? 
1 : 0 }; }; template< class iType > struct is_integral_extent_type< std::pair<iType,iType> > @@ -314,10 +310,10 @@ struct SubviewLegalArgsCompileTime; template<int RankDest, int RankSrc, int CurrentArg, class Arg, class ... SubViewArgs> struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> { - enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) || + enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) || ((CurrentArg>=RankDest) && (std::is_integral<Arg>::value)) || ((CurrentArg<RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value)) || - ((CurrentArg==0) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) + ((CurrentArg==0) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ) && (SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg+1, SubViewArgs...>::value)}; }; @@ -331,7 +327,7 @@ struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankD template<int RankDest, int RankSrc, int CurrentArg, class Arg, class ... SubViewArgs> struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> { - enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) || + enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) || ((CurrentArg<RankSrc-RankDest) && (std::is_integral<Arg>::value)) || ((CurrentArg>=RankSrc-RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value)) ) && (SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg+1, SubViewArgs...>::value)}; @@ -403,7 +399,7 @@ private: bool set( unsigned domain_rank , unsigned range_rank , const ViewDimension< DimArgs ... 
> & dim - , const Kokkos::Experimental::Impl::ALL_t + , const Kokkos::Impl::ALL_t , Args ... args ) { m_begin[ domain_rank ] = 0 ; @@ -519,7 +515,7 @@ private: , unsigned domain_rank , unsigned range_rank , const ViewDimension< DimArgs ... > & dim - , const Kokkos::Experimental::Impl::ALL_t + , const Kokkos::Impl::ALL_t , Args ... args ) const { const int n = std::min( buf_len , @@ -670,13 +666,12 @@ public: { return unsigned(i) < InternalRangeRank ? m_index[i] : ~0u ; } }; -}}} // namespace Kokkos::Experimental::Impl +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { /** \brief Given a value type and dimension generate the View data type */ @@ -814,13 +809,12 @@ public: typedef non_const_type non_const_scalar_array_type ; }; -}}} // namespace Kokkos::Experimental::Impl +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { template < class Dimension , class Layout , typename Enable = void > @@ -1228,14 +1222,14 @@ private: // If memory alignment is a multiple of the trivial scalar size then attempt to align. enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 }; - enum { div_ok = div ? div : 1 }; // To valid modulo zero in constexpr + enum { div_ok = (div != 0) ? div : 1 }; // To valid modulo zero in constexpr KOKKOS_INLINE_FUNCTION static constexpr size_t stride( size_t const N ) - { - return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) ) - ? N + align - ( N % div_ok ) : N ; - } + { + return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) ) + ? 
N + align - ( N % div_ok ) : N ; + } }; public: @@ -1707,12 +1701,12 @@ private: // If memory alignment is a multiple of the trivial scalar size then attempt to align. enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 }; - enum { div_ok = div ? div : 1 }; // To valid modulo zero in constexpr + enum { div_ok = (div != 0) ? div : 1 }; // To valid modulo zero in constexpr KOKKOS_INLINE_FUNCTION static constexpr size_t stride( size_t const N ) { - return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) ) + return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) ) ? N + align - ( N % div_ok ) : N ; } }; @@ -2225,13 +2219,12 @@ public: {} }; -}}} // namespace Kokkos::Experimental::Impl +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { /** \brief ViewDataHandle provides the type of the 'data handle' which the view @@ -2422,13 +2415,12 @@ struct ViewDataHandle< Traits , return handle_type( arg_data_ptr + offset ); } }; -}}} // namespace Kokkos::Experimental::Impl +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { //---------------------------------------------------------------------------- @@ -2451,8 +2443,9 @@ template< class ExecSpace , class ValueType > struct ViewValueFunctor< ExecSpace , ValueType , false /* is_scalar */ > { typedef Kokkos::RangePolicy< ExecSpace > PolicyType ; + typedef typename ExecSpace::execution_space Exec; - ExecSpace space ; + Exec space ; ValueType * ptr ; size_t n ; bool destroy ; @@ -2597,6 +2590,9 @@ private: public: + typedef void 
printable_label_typedef; + enum { is_managed = Traits::is_managed }; + //---------------------------------------- // Domain dimensions @@ -2944,7 +2940,7 @@ public: Kokkos::abort("View Assignment: trying to assign runtime dimension to non matching compile time dimension."); } dst.m_offset = dst_offset_type( src.m_offset ); - dst.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track ); + dst.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track ); } }; @@ -3102,7 +3098,7 @@ public: //---------------------------------------------------------------------------- -}}} // namespace Kokkos::Experimental::Impl +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -3151,6 +3147,77 @@ void view_error_operator_bounds view_error_operator_bounds<R+1>(buf+n,len-n,map,args...); } +#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + +/* Check #3: is the View managed as determined by the MemoryTraits? 
*/ +template< class MapType, + bool is_managed = (MapType::is_managed != 0) > +struct OperatorBoundsErrorOnDevice; + +template< class MapType > +struct OperatorBoundsErrorOnDevice< MapType, false > { +KOKKOS_INLINE_FUNCTION +static void run(MapType const&) { + Kokkos::abort("View bounds error"); +} +}; + +template< class MapType > +struct OperatorBoundsErrorOnDevice< MapType, true > { +KOKKOS_INLINE_FUNCTION +static void run(MapType const& map) { + char const* const user_alloc_start = reinterpret_cast<char const*>(map.data()); + char const* const header_start = user_alloc_start - sizeof(SharedAllocationHeader); + SharedAllocationHeader const* const header = + reinterpret_cast<SharedAllocationHeader const*>(header_start); + char const* const label = header->label(); + enum { LEN = 128 }; + char msg[LEN]; + char const* const first_part = "View bounds error of view "; + char* p = msg; + char* const end = msg + LEN - 1; + for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) { + *p = *p2; + } + for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) { + *p = *p2; + } + *p = '\0'; + Kokkos::abort(msg); +} +}; + +/* Check #2: does the ViewMapping have the printable_label_typedef defined? + See above that only the non-specialized standard-layout ViewMapping has + this defined by default. 
+ The existence of this typedef indicates the existence of MapType::is_managed */ +template< class T, class Enable = void > +struct has_printable_label_typedef : public std::false_type {}; + +template<class T> +struct has_printable_label_typedef< + T, typename enable_if_type<typename T::printable_label_typedef>::type> + : public std::true_type +{}; + +template< class MapType > +KOKKOS_INLINE_FUNCTION +void operator_bounds_error_on_device( + MapType const&, + std::false_type) { + Kokkos::abort("View bounds error"); +} + +template< class MapType > +KOKKOS_INLINE_FUNCTION +void operator_bounds_error_on_device( + MapType const& map, + std::true_type) { + OperatorBoundsErrorOnDevice< MapType >::run(map); +} + +#endif // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + template< class MemorySpace , class MapType , class ... Args > KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds @@ -3166,7 +3233,17 @@ void view_verify_operator_bounds view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... ); Kokkos::Impl::throw_runtime_exception(std::string(buffer)); #else - Kokkos::abort("View bounds error"); + /* Check #1: is there a SharedAllocationRecord? + (we won't use it, but if its not there then there isn't + a corresponding SharedAllocationHeader containing a label). + This check should cover the case of Views that don't + have the Unmanaged trait but were initialized by pointer. 
*/ + if (tracker.has_record()) { + operator_bounds_error_on_device<MapType>( + map, has_printable_label_typedef<MapType>()); + } else { + Kokkos::abort("View bounds error"); + } #endif } } diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp index ecbcf72fe0b6ad92b6ec074f7a1b6b5dcca3322a..5a8600e0aee34719e512229292ecb37d9b950657 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -48,7 +48,6 @@ //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { // View mapping for rank two tiled array @@ -195,11 +194,9 @@ struct ViewMapping }; } /* namespace Impl */ -} /* namespace Experimental */ } /* namespace Kokkos */ namespace Kokkos { -namespace Experimental { template< typename T , unsigned N0 , unsigned N1 , class ... 
P > KOKKOS_INLINE_FUNCTION @@ -217,7 +214,6 @@ tile_subview( const Kokkos::View<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...> & ( src , SrcLayout() , i_tile0 , i_tile1 ); } -} /* namespace Experimental */ } /* namespace Kokkos */ //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp deleted file mode 100644 index 101b714fcd7007486b9b6aef659108dd87643818..0000000000000000000000000000000000000000 --- a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp +++ /dev/null @@ -1,183 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include <Kokkos_Macros.hpp> -#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) - -#include <impl/Kokkos_spinwait.hpp> - -#include <Kokkos_Atomic.hpp> -#include <impl/Kokkos_BitOps.hpp> - -/*--------------------------------------------------------------------------*/ - -#if !defined( _WIN32 ) - #if defined( KOKKOS_ENABLE_ASM ) - #if defined( __arm__ ) || defined( __aarch64__ ) - /* No-operation instruction to idle the thread. 
*/ - #define KOKKOS_INTERNAL_PAUSE - #else - /* Pause instruction to prevent excess processor bus usage */ - #define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory") - #endif - #define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n") - #define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2 - #define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4; - #define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8; - #define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16; - namespace { - inline void kokkos_internal_yield( const unsigned i ) noexcept { - switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) { - case 0u: KOKKOS_INTERNAL_NOP2; break; - case 1u: KOKKOS_INTERNAL_NOP4; break; - case 2u: KOKKOS_INTERNAL_NOP8; break; - case 3u: KOKKOS_INTERNAL_NOP16; break; - default: KOKKOS_INTERNAL_NOP32; - } - KOKKOS_INTERNAL_PAUSE; - } - } - #else - #include <sched.h> - namespace { - inline void kokkos_internal_yield( const unsigned ) noexcept { - sched_yield(); - } - } - #endif -#else // defined( _WIN32 ) - #if defined ( KOKKOS_ENABLE_WINTHREAD ) - #include <process.h> - namespace { - inline void kokkos_internal_yield( const unsigned ) noexcept { - Sleep(0); - } - } - #elif defined( _MSC_VER ) - #define NOMINMAX - #include <winsock2.h> - #include <windows.h> - namespace { - inline void kokkos_internal_yield( const unsigned ) noexcept { - YieldProcessor(); - } - } - #else - #define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory") - #define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop") - #define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2 - #define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4; - #define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8; - #define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16; - namespace { - inline void kokkos_internal_yield( const unsigned i ) noexcept { - switch 
(Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) { - case 0: KOKKOS_INTERNAL_NOP2; break; - case 1: KOKKOS_INTERNAL_NOP4; break; - case 2: KOKKOS_INTERNAL_NOP8; break; - case 3: KOKKOS_INTERNAL_NOP16; break; - default: KOKKOS_INTERNAL_NOP32; - } - KOKKOS_INTERNAL_PAUSE; - } - } - #endif -#endif - - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -void spinwait_while_equal( volatile int32_t & flag , const int32_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value == flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -void spinwait_until_equal( volatile int32_t & flag , const int32_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value != flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -void spinwait_while_equal( volatile int64_t & flag , const int64_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value == flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -void spinwait_until_equal( volatile int64_t & flag , const int64_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value != flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#else -void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {} -#endif - diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt index 5d6f25ac958e9abac8e87127fb815871b11692b9..475b6bb48a2eee3583da12aa4baae3f3ab4e8536 100644 --- a/lib/kokkos/core/unit_test/CMakeLists.txt +++ b/lib/kokkos/core/unit_test/CMakeLists.txt @@ -57,6 +57,7 @@ IF(Kokkos_ENABLE_Serial) serial/TestSerial_ViewMapping_b.cpp serial/TestSerial_ViewMapping_subview.cpp serial/TestSerial_ViewOfClass.cpp + serial/TestSerial_WorkGraph.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -102,6 +103,7 @@ 
IF(Kokkos_ENABLE_Pthread) threads/TestThreads_ViewMapping_b.cpp threads/TestThreads_ViewMapping_subview.cpp threads/TestThreads_ViewOfClass.cpp + threads/TestThreads_WorkGraph.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -147,6 +149,8 @@ IF(Kokkos_ENABLE_OpenMP) openmp/TestOpenMP_ViewMapping_b.cpp openmp/TestOpenMP_ViewMapping_subview.cpp openmp/TestOpenMP_ViewOfClass.cpp + openmp/TestOpenMP_WorkGraph.cpp + openmp/TestOpenMP_UniqueToken.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -237,6 +241,7 @@ IF(Kokkos_ENABLE_Cuda) cuda/TestCuda_ViewMapping_b.cpp cuda/TestCuda_ViewMapping_subview.cpp cuda/TestCuda_ViewOfClass.cpp + cuda/TestCuda_WorkGraph.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -253,6 +258,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( default/TestDefaultDeviceType_b.cpp default/TestDefaultDeviceType_c.cpp default/TestDefaultDeviceType_d.cpp + default/TestDefaultDeviceTypeResize.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile index 41f192a4866ceaebd18211448daa5abe01ff16f4..c877aa7dd2d252e0d19970becbc4deaac41f2d45 100644 --- a/lib/kokkos/core/unit_test/Makefile +++ b/lib/kokkos/core/unit_test/Makefile @@ -62,8 +62,9 @@ endif OBJ_CUDA += TestCuda_TeamReductionScan.o OBJ_CUDA += TestCuda_Other.o OBJ_CUDA += TestCuda_MDRange.o - OBJ_CUDA += TestCuda_Task.o + OBJ_CUDA += TestCuda_Task.o TestCuda_WorkGraph.o OBJ_CUDA += TestCuda_Spaces.o + OBJ_CUDA += TestCuda_UniqueToken.o TARGETS += KokkosCore_UnitTest_Cuda @@ -121,7 +122,8 @@ endif OBJ_OPENMP += TestOpenMP_TeamReductionScan.o OBJ_OPENMP += TestOpenMP_Other.o OBJ_OPENMP += TestOpenMP_MDRange.o - OBJ_OPENMP += TestOpenMP_Task.o + OBJ_OPENMP += TestOpenMP_Task.o TestOpenMP_WorkGraph.o + OBJ_OPENMP += TestOpenMP_UniqueToken.o TARGETS += KokkosCore_UnitTest_OpenMP @@ -208,7 +210,7 @@ endif OBJ_SERIAL += 
TestSerial_TeamReductionScan.o OBJ_SERIAL += TestSerial_Other.o OBJ_SERIAL += TestSerial_MDRange.o - OBJ_SERIAL += TestSerial_Task.o + OBJ_SERIAL += TestSerial_Task.o TestSerial_WorkGraph.o TARGETS += KokkosCore_UnitTest_Serial diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp index 6896a27bfb027f676381d445f2140921f0a289da..87440c36be645e6911ba3ff1c13ef2ca568a572f 100644 --- a/lib/kokkos/core/unit_test/TestAggregate.hpp +++ b/lib/kokkos/core/unit_test/TestAggregate.hpp @@ -58,7 +58,7 @@ template< class DeviceType > void TestViewAggregate() { typedef Kokkos::Array< double, 32 > value_type; - typedef Kokkos::Experimental::Impl::ViewDataAnalysis< value_type *, Kokkos::LayoutLeft, value_type > analysis_1d; + typedef Kokkos::Impl::ViewDataAnalysis< value_type *, Kokkos::LayoutLeft, value_type > analysis_1d; static_assert( std::is_same< typename analysis_1d::specialize, Kokkos::Array<> >::value, "" ); diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp index 401da58a5838d7cab5adaf38a00d4231f51721d2..68864c8d66b622426382da2381164ee0933cc3f1 100644 --- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp @@ -186,6 +186,21 @@ void check_correct_initialization( const Kokkos::InitArguments & argstruct ) { // Figure out the number of threads the HostSpace ExecutionSpace should have initialized to. 
int expected_nthreads = argstruct.num_threads; +#ifdef KOKKOS_ENABLE_OPENMP + if ( std::is_same< Kokkos::HostSpace::execution_space, Kokkos::OpenMP >::value ) { + // use openmp default num threads + if ( expected_nthreads < 0 || ( expected_nthreads == 0 && !Kokkos::hwloc::available() ) ) { + expected_nthreads = omp_get_max_threads(); + } + // use hwloc if available + else if ( expected_nthreads == 0 && Kokkos::hwloc::available() ) { + expected_nthreads = Kokkos::hwloc::get_available_numa_count() + * Kokkos::hwloc::get_available_cores_per_numa() + * Kokkos::hwloc::get_available_threads_per_core(); + } + } +#endif + if ( expected_nthreads < 1 ) { if ( Kokkos::hwloc::available() ) { expected_nthreads = Kokkos::hwloc::get_available_numa_count() @@ -193,12 +208,6 @@ void check_correct_initialization( const Kokkos::InitArguments & argstruct ) { * Kokkos::hwloc::get_available_threads_per_core(); } else { -#ifdef KOKKOS_ENABLE_OPENMP - if ( std::is_same< Kokkos::HostSpace::execution_space, Kokkos::OpenMP >::value ) { - expected_nthreads = omp_get_max_threads(); - } - else -#endif expected_nthreads = 1; } diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp index 091591bcbf5b5f260bb79c49d342b58cfc03f03b..f579ddf02c8242a43931c5177cfc3cad58c45078 100644 --- a/lib/kokkos/core/unit_test/TestMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -51,6 +51,180 @@ namespace Test { namespace { +template <typename ExecSpace > +struct TestMDRange_ReduceArray_2D { + + using DataType = int; + using ViewType_2 = typename Kokkos::View< DataType**, ExecSpace >; + using HostViewType_2 = typename ViewType_2::HostMirror; + + ViewType_2 input_view; + + using scalar_type = double; + using value_type = scalar_type[]; + const unsigned value_count; + + TestMDRange_ReduceArray_2D( const int N0, const int N1, const unsigned array_size ) + : input_view( "input_view", N0, N1 ) + , value_count( array_size ) + {} + + KOKKOS_INLINE_FUNCTION + void 
init( scalar_type dst[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] = 0.0; + } + } + + KOKKOS_INLINE_FUNCTION + void join( volatile scalar_type dst[], + const volatile scalar_type src[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] += src[i]; + } + } + + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j ) const + { + input_view( i, j ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, value_type lsum ) const + { + lsum[0] += input_view( i, j ) * 2; //+=6 each time if InitTag => N0*N1*6 + lsum[1] += input_view( i, j ) ; //+=3 each time if InitTag => N0*N1*3 + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j ) const + { + input_view( i, j ) = 3; + } + + static void test_arrayreduce2( const int N0, const int N1 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int>, InitTag > range_type_init; + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type_init range_init( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + + const unsigned array_size = 2; + + TestMDRange_ReduceArray_2D functor( N0, N1, array_size ); + + parallel_for( range_init, functor ); // Init the view to 3's + + double sums[ array_size ]; + parallel_reduce( range, functor, sums ); + + // Check output + //printf("Array Reduce result. 
N0 = %d N1 = %d N0*N1 = %d sums[0] = %lf sums[1] = %lf \n", N0, N1, N0*N1, sums[0], sums[1]); + + ASSERT_EQ( sums[0], 6 * N0 * N1 ); + ASSERT_EQ( sums[1], 3 * N0 * N1 ); + } + } +}; + +template <typename ExecSpace > +struct TestMDRange_ReduceArray_3D { + + using DataType = int; + using ViewType_3 = typename Kokkos::View< DataType***, ExecSpace >; + using HostViewType_3 = typename ViewType_3::HostMirror; + + ViewType_3 input_view; + + using scalar_type = double; + using value_type = scalar_type[]; + const unsigned value_count; + + TestMDRange_ReduceArray_3D( const int N0, const int N1, const int N2, const unsigned array_size ) + : input_view( "input_view", N0, N1, N2 ) + , value_count( array_size ) + {} + + KOKKOS_INLINE_FUNCTION + void init( scalar_type dst[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] = 0.0; + } + } + + KOKKOS_INLINE_FUNCTION + void join( volatile scalar_type dst[], + const volatile scalar_type src[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] += src[i]; + } + } + + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k ) const + { + input_view( i, j, k ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k, value_type lsum ) const + { + lsum[0] += input_view( i, j, k ) * 2; //+=6 each time if InitTag => N0*N1*N2*6 + lsum[1] += input_view( i, j, k ) ; //+=3 each time if InitTag => N0*N1*N2*3 + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k ) const + { + input_view( i, j, k ) = 3; + } + + static void test_arrayreduce3( const int N0, const int N1, const int N2 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int>, InitTag > range_type_init; + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> > 
range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type_init range_init( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + + const unsigned array_size = 2; + + TestMDRange_ReduceArray_3D functor( N0, N1, N2, array_size ); + + parallel_for( range_init, functor ); // Init the view to 3's + + double sums[ array_size ]; + parallel_reduce( range, functor, sums ); + + ASSERT_EQ( sums[0], 6 * N0 * N1 * N2 ); + ASSERT_EQ( sums[1], 3 * N0 * N1 * N2 ); + } + } +}; + + template <typename ExecSpace > struct TestMDRange_2D { using DataType = int; @@ -58,6 +232,7 @@ struct TestMDRange_2D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_2D( const DataType N0, const DataType N1 ) : input_view( "input_view", N0, N1 ) {} @@ -68,7 +243,7 @@ struct TestMDRange_2D { } KOKKOS_INLINE_FUNCTION - void operator()( const int i, const int j, double &lsum ) const + void operator()( const int i, const int j, value_type &lsum ) const { lsum += input_view( i, j ) * 2; } @@ -81,6 +256,13 @@ struct TestMDRange_2D { input_view( i, j ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, value_type &lsum ) const + { + lsum += input_view( i, j ) * 3; + } + static void test_reduce2( const int N0, const int N1 ) { using namespace Kokkos::Experimental; @@ -94,13 +276,85 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, 
Kokkos::IndexType<int> > range_type; + range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + + TestMDRange_2D functor( N0, N1 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 ); + } + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type; + range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + + TestMDRange_2D functor( N0, N1 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int>, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 4 } } ); + + TestMDRange_2D functor( N0, N1 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + { + if ( h_view( i, j ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 ); + } + { typedef 
typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type; typedef typename range_type::tile_type tile_type; @@ -110,9 +364,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -126,9 +380,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -142,9 +396,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -158,9 +412,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -174,9 +428,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -194,7 +448,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -223,7 
+477,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -251,7 +505,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -280,7 +534,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -309,7 +563,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 4, 4 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -338,7 +592,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -367,7 +621,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 7, 7 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + 
parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -396,7 +650,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 16, 16 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -425,7 +679,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 5, 16 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -455,6 +709,7 @@ struct TestMDRange_3D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view( "input_view", N0, N1, N2 ) {} @@ -478,6 +733,13 @@ struct TestMDRange_3D { input_view( i, j, k ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, value_type &lsum ) const + { + lsum += input_view( i, j, k ) * 3; + } + static void test_reduce3( const int N0, const int N1, const int N2 ) { using namespace Kokkos::Experimental; @@ -491,13 +753,86 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> > range_type; + range_type 
range( {{ 0, 0, 0 }}, {{ N0, N1, N2 }}, {{ 3, 3, 3 }} ); + + TestMDRange_3D functor( N0, N1, N2 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + } + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> > range_type; + range_type range( {{ 0, 0, 0 }}, {{ N0, N1, N2 }}, {{ 3, 3, 3 }} ); + + TestMDRange_3D functor( N0, N1, N2 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int>, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } ); + + TestMDRange_3D functor( N0, N1, N2 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + { + if ( h_view( i, j, k ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 
9 * N0 * N1 * N2 ); + } + { typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> > range_type; typedef typename range_type::tile_type tile_type; @@ -507,9 +842,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -523,9 +858,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -539,9 +874,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -555,9 +890,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -571,9 +906,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -590,7 +925,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( 
functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -620,7 +955,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -651,7 +986,7 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -681,7 +1016,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -711,7 +1046,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -741,7 +1076,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 5, 7 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -771,7 +1106,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 8, 8, 8 } 
} ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -801,7 +1136,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -832,6 +1167,7 @@ struct TestMDRange_4D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_4D( const DataType N0, const DataType N1, const DataType N2, const DataType N3 ) : input_view( "input_view", N0, N1, N2, N3 ) {} @@ -855,6 +1191,191 @@ struct TestMDRange_4D { input_view( i, j, k, l ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, const int l, value_type &lsum ) const + { + lsum += input_view( i, j, k, l ) * 3; + } + + static void test_reduce4( const int N0, const int N1, const int N2, const int N3 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType<int> > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, 
Kokkos::IndexType<int> > range_type; + range_type range( {{ 0, 0, 0, 0 }}, {{ N0, N1, N2, N3 }}, {{ 3, 3, 3, 3 }} ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType<int> > range_type; + range_type range( {{ 0, 0, 0, 0 }}, {{ N0, N1, N2, N3 }}, {{ 3, 3, 3, 3 }} ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int>, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + for ( int l = 0; l < N3; ++l ) + { + if ( h_view( i, j, k, l ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_reduce4 
parallel_for init; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Right, 
Iterate::Left>, Kokkos::IndexType<int> > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + } // end test_reduce + + + static void test_for4( const int N0, const int N1, const int N2, const int N3 ) { using namespace Kokkos::Experimental; @@ -866,7 +1387,7 @@ struct TestMDRange_4D { range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -897,7 +1418,7 @@ struct TestMDRange_4D { range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -929,7 +1450,7 @@ struct TestMDRange_4D { 
TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -961,7 +1482,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -993,7 +1514,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1025,7 +1546,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1057,7 +1578,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1089,7 +1610,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1121,6 +1642,7 @@ struct TestMDRange_5D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_5D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4 ) : input_view( "input_view", N0, N1, N2, N3, N4 ) {} @@ -1131,7 +1653,7 @@ struct TestMDRange_5D { } 
KOKKOS_INLINE_FUNCTION - void operator()( const int i, const int j, const int k, const int l, const int m, double &lsum ) const + void operator()( const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const { lsum += input_view( i, j, k, l, m ) * 2; } @@ -1144,6 +1666,110 @@ struct TestMDRange_5D { input_view( i, j, k, l, m ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const + { + lsum += input_view( i, j, k, l, m ) * 3; + } + + static void test_reduce5( const int N0, const int N1, const int N2, const int N3, const int N4 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType<int> > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + } + + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType<int> > range_type; + range_type range( {{ 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4 }}, {{ 3, 3, 3, 3, 3 }} ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + } + + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType<int> > range_type; + range_type 
range( {{ 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4 }}, {{ 3, 3, 3, 3, 3 }} ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int>, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 2, 4, 6, 2, 2 } } ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + for ( int l = 0; l < N3; ++l ) + for ( int m = 0; m < N4; ++m ) + { + if ( h_view( i, j, k, l, m ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_reduce5 parallel_for init; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 * N4 ); + } + } + static void test_for5( const int N0, const int N1, const int N2, const int N3, const int N4 ) { using namespace Kokkos::Experimental; @@ -1155,7 +1781,7 @@ struct TestMDRange_5D { range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } } ); TestMDRange_5D 
functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1184,10 +1810,10 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 7 } } ); + range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1220,7 +1846,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1253,7 +1879,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1286,7 +1912,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1319,7 +1945,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ 
-1352,7 +1978,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1385,7 +2011,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1418,6 +2044,7 @@ struct TestMDRange_6D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_6D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4, const DataType N5 ) : input_view( "input_view", N0, N1, N2, N3, N4, N5 ) {} @@ -1428,7 +2055,7 @@ struct TestMDRange_6D { } KOKKOS_INLINE_FUNCTION - void operator()( const int i, const int j, const int k, const int l, const int m, const int n, double &lsum ) const + void operator()( const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const { lsum += input_view( i, j, k, l, m, n ) * 2; } @@ -1441,6 +2068,111 @@ struct TestMDRange_6D { input_view( i, j, k, l, m, n ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const + { + lsum += input_view( i, j, k, l, m, n ) * 3; + } + + static void test_reduce6( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType<int> > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type 
range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType<int> > range_type; + range_type range( {{ 0, 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4, N5 }}, {{ 3, 3, 3, 3, 3, 2 }} ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType<int> > range_type; + range_type range( {{ 0, 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4, N5 }}, {{ 3, 3, 3, 3, 3, 2 }} ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int>, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 2, 4, 6, 2, 2, 2 } } ); + + TestMDRange_6D 
functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + for ( int l = 0; l < N3; ++l ) + for ( int m = 0; m < N4; ++m ) + for ( int n = 0; n < N5; ++n ) + { + if ( h_view( i, j, k, l, m, n ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_reduce6 parallel_for init; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + } + static void test_for6( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 ) { using namespace Kokkos::Experimental; @@ -1452,7 +2184,7 @@ struct TestMDRange_6D { range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } } ); TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1485,7 +2217,7 @@ struct TestMDRange_6D { range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1519,7 +2251,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + 
parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1553,7 +2285,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1587,7 +2319,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1621,7 +2353,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1655,7 +2387,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1689,7 +2421,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1726,11 +2458,19 @@ TEST_F( TEST_CATEGORY , mdrange_for ) { TestMDRange_6D< TEST_EXECSPACE >::test_for6( 10, 10, 10, 10, 5, 5 ); } -#ifndef KOKKOS_ENABLE_CUDA TEST_F( TEST_CATEGORY , mdrange_reduce ) { TestMDRange_2D< TEST_EXECSPACE >::test_reduce2( 100, 100 ); TestMDRange_3D< TEST_EXECSPACE >::test_reduce3( 100, 10, 100 ); + TestMDRange_4D< TEST_EXECSPACE >::test_reduce4( 100, 10, 10, 10 ); + 
TestMDRange_5D< TEST_EXECSPACE >::test_reduce5( 100, 10, 10, 10, 5 ); + TestMDRange_6D< TEST_EXECSPACE >::test_reduce6( 100, 10, 10, 10, 5, 5 ); +} + +//#ifndef KOKKOS_ENABLE_CUDA +TEST_F( TEST_CATEGORY , mdrange_array_reduce ) { + TestMDRange_ReduceArray_2D< TEST_EXECSPACE >::test_arrayreduce2( 4, 5 ); + TestMDRange_ReduceArray_3D< TEST_EXECSPACE >::test_arrayreduce3( 4, 5, 10 ); } -#endif +//#endif } // namespace Test diff --git a/lib/kokkos/core/unit_test/TestMemoryPool.hpp b/lib/kokkos/core/unit_test/TestMemoryPool.hpp index 941cd6c26d4a5ada4dd3ac5b583033cd099e4d8f..9f708390c2d6d448c18c92b908609e160557937e 100644 --- a/lib/kokkos/core/unit_test/TestMemoryPool.hpp +++ b/lib/kokkos/core/unit_test/TestMemoryPool.hpp @@ -54,6 +54,96 @@ namespace TestMemoryPool { +template< typename MemSpace = Kokkos::HostSpace > +void test_host_memory_pool_defaults() +{ + typedef typename MemSpace::execution_space Space ; + typedef typename Kokkos::MemoryPool< Space > MemPool ; + + { + const size_t MemoryCapacity = 32000 ; + const size_t MinBlockSize = 64 ; + const size_t MaxBlockSize = 1024 ; + const size_t SuperBlockSize = 4096 ; + + MemPool pool( MemSpace() + , MemoryCapacity + , MinBlockSize + , MaxBlockSize + , SuperBlockSize + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_LE( MinBlockSize , stats.min_block_bytes ); + ASSERT_LE( MaxBlockSize , stats.max_block_bytes ); + ASSERT_LE( SuperBlockSize , stats.superblock_bytes ); + } + + { + const size_t MemoryCapacity = 10000 ; + + MemPool pool( MemSpace() + , MemoryCapacity + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_LE( 64u /* default */ , stats.min_block_bytes ); + ASSERT_LE( stats.min_block_bytes , stats.max_block_bytes ); + ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes ); + ASSERT_LE( 
stats.superblock_bytes , stats.capacity_bytes ); + } + + { + const size_t MemoryCapacity = 10000 ; + const size_t MinBlockSize = 32 ; // power of two is exact + + MemPool pool( MemSpace() + , MemoryCapacity + , MinBlockSize + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_EQ( MinBlockSize , stats.min_block_bytes ); + ASSERT_LE( stats.min_block_bytes , stats.max_block_bytes ); + ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes ); + ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes ); + } + + { + const size_t MemoryCapacity = 32000 ; + const size_t MinBlockSize = 32 ; // power of two is exact + const size_t MaxBlockSize = 1024 ; // power of two is exact + + MemPool pool( MemSpace() + , MemoryCapacity + , MinBlockSize + , MaxBlockSize + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_EQ( MinBlockSize , stats.min_block_bytes ); + ASSERT_EQ( MaxBlockSize , stats.max_block_bytes ); + ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes ); + ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes ); + } +} + template< typename MemSpace = Kokkos::HostSpace > void test_host_memory_pool_stats() { @@ -188,8 +278,8 @@ void print_memory_pool_stats << " bytes reserved = " << stats.reserved_bytes << std::endl << " bytes free = " << ( stats.capacity_bytes - ( stats.consumed_bytes + stats.reserved_bytes ) ) << std::endl - << " alloc used = " << stats.consumed_blocks << std::endl - << " alloc reserved = " << stats.reserved_blocks << std::endl + << " block used = " << stats.consumed_blocks << std::endl + << " block reserved = " << stats.reserved_blocks << std::endl << " super used = " << stats.consumed_superblocks << std::endl << " super reserved = " << ( stats.capacity_superblocks - stats.consumed_superblocks ) << std::endl @@ -302,15 +392,147 @@ 
void test_memory_pool_v2( const bool print_statistics //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -} // namespace TestMemoryPool { +template< class DeviceType > +struct TestMemoryPoolCorners { + + typedef Kokkos::View< uintptr_t * , DeviceType > ptrs_type ; + typedef Kokkos::MemoryPool< DeviceType > pool_type ; + + pool_type pool ; + ptrs_type ptrs ; + uint32_t size ; + uint32_t stride ; + + TestMemoryPoolCorners( const pool_type & arg_pool + , const ptrs_type & arg_ptrs + , const uint32_t arg_base + , const uint32_t arg_stride + ) + : pool( arg_pool ) + , ptrs( arg_ptrs ) + , size( arg_base ) + , stride( arg_stride ) + {} + + // Specify reduction argument value_type to + // avoid confusion with tag-dispatch. + + using value_type = long ; + + KOKKOS_INLINE_FUNCTION + void operator()( int i , long & err ) const noexcept + { + unsigned alloc_size = size << ( i % stride ); + if ( 0 == ptrs(i) ) { + ptrs(i) = (uintptr_t) pool.allocate( alloc_size ); + if ( ptrs(i) && ! 
alloc_size ) { ++err ; } + } + } + + struct TagDealloc {}; + + KOKKOS_INLINE_FUNCTION + void operator()( int i ) const noexcept + { + unsigned alloc_size = size << ( i % stride ); + if ( ptrs(i) ) { pool.deallocate( (void*) ptrs(i) , alloc_size ); } + ptrs(i) = 0 ; + } +}; + +template< class DeviceType > +void test_memory_pool_corners( const bool print_statistics + , const bool print_superblocks ) +{ + typedef typename DeviceType::memory_space memory_space ; + typedef typename DeviceType::execution_space execution_space ; + typedef Kokkos::MemoryPool< DeviceType > pool_type ; + typedef TestMemoryPoolCorners< DeviceType > functor_type ; + typedef typename functor_type::ptrs_type ptrs_type ; + + { + // superblock size 1 << 14 + const size_t min_superblock_size = 1u << 14 ; + + // four superblocks + const size_t total_alloc_size = min_superblock_size * 4 ; + + // block sizes { 64 , 128 , 256 , 512 } + // block counts { 256 , 128 , 64 , 32 } + const unsigned min_block_size = 64 ; + const unsigned max_block_size = 512 ; + const unsigned num_blocks = 480 ; + + pool_type pool( memory_space() + , total_alloc_size + , min_block_size + , max_block_size + , min_superblock_size ); + + // Allocate one block from each superblock to lock that + // superblock into the block size. 
+ + ptrs_type ptrs("ptrs",num_blocks); + + long err = 0 ; + + Kokkos::parallel_reduce + ( Kokkos::RangePolicy< execution_space >(0,4) + , functor_type( pool , ptrs , 64 , 4 ) + , err + ); + + if ( print_statistics || err ) { + + typename pool_type::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + print_memory_pool_stats< pool_type >( stats ); + } + + if ( print_superblocks || err ) { + pool.print_state( std::cout ); + } + + // Now fill remaining allocations with small size + + Kokkos::parallel_reduce + ( Kokkos::RangePolicy< execution_space >(0,num_blocks) + , functor_type( pool , ptrs , 64 , 1 ) + , err + ); + + if ( print_statistics || err ) { + + typename pool_type::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + print_memory_pool_stats< pool_type >( stats ); + } + + if ( print_superblocks || err ) { + pool.print_state( std::cout ); + } + } +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +} // namespace TestMemoryPool namespace Test { TEST_F( TEST_CATEGORY, memory_pool ) { + TestMemoryPool::test_host_memory_pool_defaults<>(); TestMemoryPool::test_host_memory_pool_stats<>(); TestMemoryPool::test_memory_pool_v2< TEST_EXECSPACE >(false,false); + TestMemoryPool::test_memory_pool_corners< TEST_EXECSPACE >(false,false); } + } #endif diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp index f55574761b89170f3d6068bb649bda3944d4aed3..3cea1ad4a00e5476df6fbd5c20518c5426884cf6 100644 --- a/lib/kokkos/core/unit_test/TestRange.hpp +++ b/lib/kokkos/core/unit_test/TestRange.hpp @@ -72,8 +72,33 @@ struct TestRange { typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( m_flags ); Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), *this ); + +#if defined(KOKKOS_ENABLE_PROFILING) + { + typedef TestRange< ExecSpace, ScheduleType > 
ThisType; + std::string label("parallel_for"); + Kokkos::Impl::ParallelConstructName< ThisType, void> pcn(label); + ASSERT_EQ( pcn.get(), label ); + std::string empty_label(""); + Kokkos::Impl::ParallelConstructName< ThisType, void> empty_pcn(empty_label); + ASSERT_EQ( empty_pcn.get(), typeid(ThisType).name() ); + } +#endif + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, VerifyInitTag >( 0, N ), *this ); +#if defined(KOKKOS_ENABLE_PROFILING) + { + typedef TestRange< ExecSpace, ScheduleType > ThisType; + std::string label("parallel_for"); + Kokkos::Impl::ParallelConstructName< ThisType, VerifyInitTag> pcn(label); + ASSERT_EQ( pcn.get(), label ); + std::string empty_label(""); + Kokkos::Impl::ParallelConstructName< ThisType, VerifyInitTag> empty_pcn(empty_label); + ASSERT_EQ( empty_pcn.get(), std::string(typeid(ThisType).name()) + "/" + typeid(VerifyInitTag).name() ); + } +#endif + Kokkos::deep_copy( host_flags, m_flags ); int error_count = 0; diff --git a/lib/kokkos/core/unit_test/TestResize.hpp b/lib/kokkos/core/unit_test/TestResize.hpp new file mode 100644 index 0000000000000000000000000000000000000000..aaf0422b19ce0e3b90d688b249de386befaaa565 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestResize.hpp @@ -0,0 +1,140 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef TESTVIEWSUBVIEW_HPP_ +#define TESTVIEWSUBVIEW_HPP_ + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +namespace TestViewResize { + +template<class DeviceType> +void testResize () +{ + const int sizes[8] = {2, 3, 4, 5, 6, 7, 8, 9}; + + // Check #904 fix (no reallocation if dimensions didn't change). 
+ { + typedef Kokkos::View<int*, DeviceType> view_type; + view_type view_1d ("view_1d", sizes[0]); + const int* oldPointer = view_1d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_1d, sizes[0]); + const int* newPointer = view_1d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int**, DeviceType> view_type; + view_type view_2d ("view_2d", sizes[0], sizes[1]); + const int* oldPointer = view_2d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_2d, sizes[0], sizes[1]); + const int* newPointer = view_2d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int***, DeviceType> view_type; + view_type view_3d ("view_3d", sizes[0], sizes[1], sizes[2]); + const int* oldPointer = view_3d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_3d, sizes[0], sizes[1], sizes[2]); + const int* newPointer = view_3d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int****, DeviceType> view_type; + view_type view_4d ("view_4d", sizes[0], sizes[1], sizes[2], sizes[3]); + const int* oldPointer = view_4d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_4d, sizes[0], sizes[1], sizes[2], sizes[3]); + const int* newPointer = view_4d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int*****, DeviceType> view_type; + view_type view_5d ("view_5d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4]); + const int* oldPointer = view_5d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_5d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4]); + const int* newPointer = view_5d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int******, DeviceType> view_type; + view_type view_6d ("view_6d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5]); + const int* oldPointer = view_6d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize 
(view_6d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], + sizes[5]); + const int* newPointer = view_6d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int*******, DeviceType> view_type; + view_type view_7d ("view_7d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6]); + const int* oldPointer = view_7d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_7d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], + sizes[5], sizes[6]); + const int* newPointer = view_7d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int********, DeviceType> view_type; + view_type view_8d ("view_8d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6], sizes[7]); + const int* oldPointer = view_8d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_8d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], + sizes[5], sizes[6], sizes[7]); + const int* newPointer = view_8d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } +} + +} // namespace TestViewSubview + +#endif // TESTVIEWSUBVIEW_HPP_ diff --git a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp index 3a88475620fbf855e9a3b360d87ce164f4710376..4e6654385792b1b08c34e97c4e4011f816cebbf1 100644 --- a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp +++ b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp @@ -250,13 +250,21 @@ struct TestTaskDependence { const int n = CHUNK < m_count ? CHUNK : m_count; if ( 1 < m_count ) { - future_type f[ CHUNK ]; + // Test use of memory pool for temporary allocation: + + // Raw allocation: + future_type * const f = + (future_type *) m_sched.memory()->allocate( sizeof(future_type) * n ); + + // In-place construction: + for ( int i = 0; i < n; ++i ) new(f+i) future_type(); const int inc = ( m_count + n - 1 ) / n; for ( int i = 0; i < n; ++i ) { long begin = i * inc; long count = begin + inc < m_count ? 
inc : m_count - begin; + f[i] = Kokkos::task_spawn( Kokkos::TaskSingle( m_sched ) , TestTaskDependence( count, m_sched, m_accum ) ); } @@ -264,6 +272,12 @@ struct TestTaskDependence { m_count = 0; Kokkos::respawn( this, Kokkos::when_all( f, n ) ); + + // In-place destruction to release future: + for ( int i = 0; i < n; ++i ) (f+i)->~future_type(); + + // Raw deallocation: + m_sched.memory()->deallocate( f , sizeof(future_type) * n ); } else if ( 1 == m_count ) { Kokkos::atomic_increment( & m_accum() ); @@ -641,19 +655,12 @@ namespace Test { TEST_F( TEST_CATEGORY, task_fib ) { - const int N = 24 ; // 25 triggers tbd bug on Cuda/Pascal + const int N = 27 ; for ( int i = 0; i < N; ++i ) { - TestTaskScheduler::TestFib< TEST_EXECSPACE >::run( i , ( i + 1 ) * ( i + 1 ) * 10000 ); + TestTaskScheduler::TestFib< TEST_EXECSPACE >::run( i , ( i + 1 ) * ( i + 1 ) * 2000 ); } } -#if defined(KOKKOS_ARCH_MAXWELL) || defined(KOKKOS_ARCH_PASCAL) - // TODO: Resolve bug in task DAG for Pascal - #define KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL -#endif - -#ifndef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL - TEST_F( TEST_CATEGORY, task_depend ) { for ( int i = 0; i < 25; ++i ) { @@ -667,11 +674,8 @@ TEST_F( TEST_CATEGORY, task_team ) //TestTaskScheduler::TestTaskTeamValue< TEST_EXECSPACE >::run( 1000 ); // Put back after testing. 
} -#else //ndef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL -#undef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL -#endif //ndef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL - } + #endif // #if defined( KOKKOS_ENABLE_TASKDAG ) #endif // #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp index e9e2f7548aebf589796d51e4f5bd3490ecfd0fab..7f4663d0f90a66b74dc394109642b58887b9a83b 100644 --- a/lib/kokkos/core/unit_test/TestTeamVector.hpp +++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp @@ -838,6 +838,18 @@ public: }, result ); const ScalarType solution = (ScalarType) nrows * (ScalarType) ncols; + + if ( int64_t(solution) != int64_t(result) ) { + printf( " TestTripleNestedReduce failed solution(%ld) != result(%ld), nrows(%d) ncols(%d) league_size(%d) team_size(%d)\n" + , int64_t(solution) + , int64_t(result) + , int32_t(nrows) + , int32_t(ncols) + , int32_t(nrows/chunk_size) + , int32_t(team_size) + ); + } + ASSERT_EQ( solution, result ); } }; diff --git a/lib/kokkos/core/unit_test/TestTile.hpp b/lib/kokkos/core/unit_test/TestTile.hpp index 8f57dfea75fd362d0fc9669820e19c92bbc9ff74..f15667322fd3f423c61598daea7065de67c7f380 100644 --- a/lib/kokkos/core/unit_test/TestTile.hpp +++ b/lib/kokkos/core/unit_test/TestTile.hpp @@ -94,7 +94,7 @@ struct ReduceTileErrors const size_t jtile = iwork / tile_dim0; if ( jtile < tile_dim1 ) { - tile_type tile = Kokkos::Experimental::tile_subview( m_array, itile, jtile ); + tile_type tile = Kokkos::tile_subview( m_array, itile, jtile ); if ( tile( 0, 0 ) != ptrdiff_t( ( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) { ++errors; diff --git a/lib/kokkos/core/unit_test/TestUniqueToken.hpp b/lib/kokkos/core/unit_test/TestUniqueToken.hpp new file mode 100644 index 0000000000000000000000000000000000000000..28add61a8a8f74f878d02ff61307c7ad1f6937b0 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestUniqueToken.hpp @@ -0,0 +1,138 @@ 
+/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { + +template< class Space > +class TestUniqueToken +{ +public: + typedef typename Space::execution_space execution_space; + typedef Kokkos::View< int * , execution_space > view_type ; + + Kokkos::Experimental::UniqueToken< execution_space , Kokkos::Experimental::UniqueTokenScope::Global > tokens ; + + view_type verify ; + view_type counts ; + view_type errors ; + + KOKKOS_INLINE_FUNCTION + void operator()( long ) const + { + const int32_t t = tokens.acquire(); + + bool ok = true ; + + ok = ok && 0 <= t ; + ok = ok && t < tokens.size(); + ok = ok && 0 == Kokkos::atomic_fetch_add( & verify(t) , 1 ); + + Kokkos::atomic_fetch_add( & counts(t) , 1 ); + + ok = ok && 1 == Kokkos::atomic_fetch_add( & verify(t) , -1 ); + + if ( ! ok ) { Kokkos::atomic_fetch_add( & errors(0) , 1 ) ; } + + tokens.release(t); + } + + TestUniqueToken() + : tokens( execution_space() ) + , verify( "TestUniqueTokenVerify" , tokens.size() ) + , counts( "TestUniqueTokenCounts" , tokens.size() ) + , errors( "TestUniqueTokenErrors" , 1 ) + {} + + static void run() + { + using policy = Kokkos::RangePolicy<execution_space> ; + + TestUniqueToken self ; + + { + const int duplicate = 100 ; + const long n = duplicate * self.tokens.size(); + + Kokkos::parallel_for( policy(0,n) , self ); + Kokkos::parallel_for( policy(0,n) , self ); + Kokkos::parallel_for( policy(0,n) , self ); + Kokkos::fence(); + } + + typename view_type::HostMirror host_counts = + Kokkos::create_mirror_view( self.counts ); + + Kokkos::deep_copy( host_counts , self.counts ); + + int32_t max = 0 ; + + { + const long n = host_counts.extent(0); + for ( long i = 0 ; i < n ; ++i ) { + if ( max < host_counts[i] ) max = host_counts[i] ; + } + } + + std::cout << "TestUniqueToken max reuse = " << max << std::endl ; + + typename view_type::HostMirror 
host_errors = + Kokkos::create_mirror_view( self.errors ); + + Kokkos::deep_copy( host_errors , self.errors ); + + ASSERT_EQ( host_errors(0) , 0 ); + } +}; + + +TEST_F( TEST_CATEGORY, unique_token ) +{ + TestUniqueToken< TEST_EXECSPACE >::run(); +} + +} // namespace Test + diff --git a/lib/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp b/lib/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp new file mode 100644 index 0000000000000000000000000000000000000000..305ddb2a1d09f8535f42b456ca7e9ff22ce60a34 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp @@ -0,0 +1,160 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <type_traits> +#include <typeinfo> + +namespace Test { + +namespace { + +template <typename ExecSpace > +struct TestViewCtorProp_EmbeddedDim { + + using ViewIntType = typename Kokkos::View< int**, ExecSpace >; + using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >; + + // Cuda 7.0 has issues with using a lamda in parallel_for to initialize the view - replace with this functor + template < class ViewType > + struct Functor { + + ViewType v; + + Functor( const ViewType & v_ ) : v(v_) {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int i ) const { + v(i) = i; + } + + }; + + + static void test_vcpt( const int N0, const int N1 ) + { + + // Create views to test + { + using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ; + using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ; + + VIT vi1("vi1", N0, N1); + VDT vd1("vd1", N0); + + // TEST: Test for common type between two views, one with type double, other with type int + // Deduce common value_type and construct a view with that type + { + // Two views + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1); + typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType; 
+ typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT; + typedef typename CVT::HostMirror HostCVT; + + // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg + CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 ); + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), + Functor<CVT>(cv1) + ); + + HostCVT hcv1 = Kokkos::create_mirror_view( cv1 ); + Kokkos::deep_copy( hcv1, cv1 ); + + ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ; + #if 0 + // debug output + for ( int i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + } + + printf( " Common value type view: %s \n", typeid( CVT() ).name() ); + printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); + if ( std::is_same< CommonViewValueType, double >::value == true ) { + printf("Proper common value_type\n"); + } + else { + printf("WRONG common value_type\n"); + } + // end debug output + #endif + } + + { + // Single view + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1); + typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType; + typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT; + typedef typename CVT::HostMirror HostCVT; + + // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg + CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 ); + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), + Functor<CVT>(cv1) + ); + + HostCVT hcv1 = Kokkos::create_mirror_view( cv1 ); + Kokkos::deep_copy( hcv1, cv1 ); + + ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ; + } + + } + + } // end test_vcpt + +}; // end struct + +} // namespace + +TEST_F( TEST_CATEGORY , viewctorprop_embedded_dim ) { + TestViewCtorProp_EmbeddedDim< TEST_EXECSPACE >::test_vcpt( 2, 3 ); +} + +} // namespace Test 
diff --git a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp index 6830c2e049bac5badc2ce8923026b25ba8e1f171..810ae72e7367b4d30a62736756e1cf88c2827509 100644 --- a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp +++ b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp @@ -56,24 +56,24 @@ void test_view_mapping() { typedef typename Space::execution_space ExecSpace; - typedef Kokkos::Experimental::Impl::ViewDimension<> dim_0; - typedef Kokkos::Experimental::Impl::ViewDimension< 2 > dim_s2; - typedef Kokkos::Experimental::Impl::ViewDimension< 2, 3 > dim_s2_s3; - typedef Kokkos::Experimental::Impl::ViewDimension< 2, 3, 4 > dim_s2_s3_s4; + typedef Kokkos::Impl::ViewDimension<> dim_0; + typedef Kokkos::Impl::ViewDimension< 2 > dim_s2; + typedef Kokkos::Impl::ViewDimension< 2, 3 > dim_s2_s3; + typedef Kokkos::Impl::ViewDimension< 2, 3, 4 > dim_s2_s3_s4; - typedef Kokkos::Experimental::Impl::ViewDimension< 0 > dim_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 3 > dim_s0_s3; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 3, 4 > dim_s0_s3_s4; + typedef Kokkos::Impl::ViewDimension< 0 > dim_s0; + typedef Kokkos::Impl::ViewDimension< 0, 3 > dim_s0_s3; + typedef Kokkos::Impl::ViewDimension< 0, 3, 4 > dim_s0_s3_s4; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0 > dim_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 4 > dim_s0_s0_s4; + typedef Kokkos::Impl::ViewDimension< 0, 0 > dim_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 4 > dim_s0_s0_s4; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0 > dim_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0 > dim_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0 > 
dim_s0_s0_s0_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0 > dim_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0 > dim_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0_s0; // Fully static dimensions should not be larger than an int. ASSERT_LE( sizeof( dim_0 ), sizeof( int ) ); @@ -186,12 +186,12 @@ void test_view_mapping() //---------------------------------------- - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s0, Kokkos::LayoutStride > stride_s0_s0_s0; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s0, Kokkos::LayoutStride > stride_s0_s0_s0; //---------------------------------------- // Static dimension. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutLeft > left_s2_s3_s4; + typedef Kokkos::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutLeft > left_s2_s3_s4; ASSERT_EQ( sizeof( left_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) ); @@ -223,7 +223,7 @@ void test_view_mapping() //---------------------------------------- // Small dimension is unpadded. 
{ - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutLeft( 2, 3, 0, 0, 0, 0, 0, 0 ) ); @@ -275,7 +275,7 @@ void test_view_mapping() constexpr int N0 = 2000; constexpr int N1 = 300; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) ); @@ -314,7 +314,7 @@ void test_view_mapping() //---------------------------------------- // Static dimension. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutRight > right_s2_s3_s4; + typedef Kokkos::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutRight > right_s2_s3_s4; ASSERT_EQ( sizeof( right_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) ); @@ -350,7 +350,7 @@ void test_view_mapping() //---------------------------------------- // Small dimension is unpadded. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutRight( 2, 3, 0, 0, 0, 0, 0, 0 ) ); @@ -391,7 +391,7 @@ void test_view_mapping() constexpr int N0 = 2000; constexpr int N1 = 300; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) ); @@ -431,18 +431,18 @@ void test_view_mapping() // Subview. 
{ // Mapping rank 4 to rank 3 - typedef Kokkos::Experimental::Impl::SubviewExtents< 4, 3 > SubviewExtents; + typedef Kokkos::Impl::SubviewExtents< 4, 3 > SubviewExtents; constexpr int N0 = 1000; constexpr int N1 = 2000; constexpr int N2 = 3000; constexpr int N3 = 4000; - Kokkos::Experimental::Impl::ViewDimension< N0, N1, N2, N3 > dim; + Kokkos::Impl::ViewDimension< N0, N1, N2, N3 > dim; SubviewExtents tmp( dim , N0 / 2 - , Kokkos::Experimental::ALL + , Kokkos::ALL , std::pair< int, int >( N2 / 4, 10 + N2 / 4 ) , Kokkos::pair< int, int >( N3 / 4, 20 + N3 / 4 ) ); @@ -469,12 +469,12 @@ void test_view_mapping() constexpr int sub_N1 = 200; constexpr int sub_N2 = 4; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) ); - Kokkos::Experimental::Impl::SubviewExtents< 3, 3 > + Kokkos::Impl::SubviewExtents< 3, 3 > sub( dyn_off3.m_dim , Kokkos::pair< int, int >( 0, sub_N0 ) , Kokkos::pair< int, int >( 0, sub_N1 ) @@ -509,12 +509,12 @@ void test_view_mapping() constexpr int sub_N1 = 200; constexpr int sub_N2 = 4; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) ); - Kokkos::Experimental::Impl::SubviewExtents< 3, 3 > + Kokkos::Impl::SubviewExtents< 3, 3 > sub( dyn_off3.m_dim , Kokkos::pair< int, int >( 0, sub_N0 ) , Kokkos::pair< int, int >( 0, sub_N1 ) @@ -544,7 +544,7 @@ void test_view_mapping() //---------------------------------------- // View data analysis. 
{ - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; static_assert( rank_dynamic<>::value == 0, "" ); static_assert( rank_dynamic< 1 >::value == 0, "" ); @@ -554,7 +554,7 @@ void test_view_mapping() } { - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; typedef ViewArrayAnalysis< int[] > a_int_r1; typedef ViewArrayAnalysis< int**[4][5][6] > a_int_r5; @@ -598,7 +598,7 @@ void test_view_mapping() } { - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; typedef int t_i4[4]; @@ -616,12 +616,12 @@ void test_view_mapping() } { - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; typedef ViewDataAnalysis< const int[], void > a_const_int_r1; static_assert( std::is_same< typename a_const_int_r1::specialize, void >::value, "" ); - static_assert( std::is_same< typename a_const_int_r1::dimension, Kokkos::Experimental::Impl::ViewDimension<0> >::value, "" ); + static_assert( std::is_same< typename a_const_int_r1::dimension, Kokkos::Impl::ViewDimension<0> >::value, "" ); static_assert( std::is_same< typename a_const_int_r1::type, const int * >::value, "" ); static_assert( std::is_same< typename a_const_int_r1::value_type, const int >::value, "" ); @@ -637,7 +637,7 @@ void test_view_mapping() static_assert( std::is_same< typename a_const_int_r3::specialize, void >::value, "" ); - static_assert( std::is_same< typename a_const_int_r3::dimension, Kokkos::Experimental::Impl::ViewDimension<0, 0, 4> >::value, "" ); + static_assert( std::is_same< typename a_const_int_r3::dimension, Kokkos::Impl::ViewDimension<0, 0, 4> >::value, "" ); static_assert( std::is_same< typename a_const_int_r3::type, const int**[4] >::value, "" ); static_assert( std::is_same< typename a_const_int_r3::value_type, const int >::value, "" ); @@ -786,7 +786,7 @@ void test_view_mapping() // The execution space of the memory space must be available for view data initialization. 
if ( std::is_same< ExecSpace, typename ExecSpace::memory_space::execution_space >::value ) { - using namespace Kokkos::Experimental; + using namespace Kokkos; typedef typename ExecSpace::memory_space memory_space; typedef View< int*, memory_space > V; @@ -811,8 +811,8 @@ void test_view_mapping() { typedef Kokkos::ViewTraits< int***, Kokkos::LayoutStride, ExecSpace > traits_t; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0 > dims_t; - typedef Kokkos::Experimental::Impl::ViewOffset< dims_t, Kokkos::LayoutStride > offset_t; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0 > dims_t; + typedef Kokkos::Impl::ViewOffset< dims_t, Kokkos::LayoutStride > offset_t; Kokkos::LayoutStride stride; @@ -836,8 +836,8 @@ void test_view_mapping() ASSERT_EQ( offset.span(), 60 ); ASSERT_TRUE( offset.span_is_contiguous() ); - Kokkos::Experimental::Impl::ViewMapping< traits_t, void > - v( Kokkos::Experimental::Impl::ViewCtorProp< int* >( (int*) 0 ), stride ); + Kokkos::Impl::ViewMapping< traits_t, void > + v( Kokkos::Impl::ViewCtorProp< int* >( (int*) 0 ), stride ); } { @@ -849,8 +849,8 @@ void test_view_mapping() constexpr int N1 = 11; V a( "a", N0, N1 ); - M b = Kokkos::Experimental::create_mirror( a ); - M c = Kokkos::Experimental::create_mirror_view( a ); + M b = Kokkos::create_mirror( a ); + M c = Kokkos::create_mirror_view( a ); M d; for ( int i0 = 0; i0 < N0; ++i0 ) @@ -859,8 +859,8 @@ void test_view_mapping() b( i0, i1 ) = 1 + i0 + i1 * N0; } - Kokkos::Experimental::deep_copy( a, b ); - Kokkos::Experimental::deep_copy( c, a ); + Kokkos::deep_copy( a, b ); + Kokkos::deep_copy( c, a ); for ( int i0 = 0; i0 < N0; ++i0 ) for ( int i1 = 0; i1 < N1; ++i1 ) @@ -868,7 +868,7 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) ); } - Kokkos::Experimental::resize( b, 5, 6 ); + Kokkos::resize( b, 5, 6 ); for ( int i0 = 0; i0 < 5; ++i0 ) for ( int i1 = 0; i1 < 6; ++i1 ) @@ -878,8 +878,8 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), val ); } - 
Kokkos::Experimental::realloc( c, 5, 6 ); - Kokkos::Experimental::realloc( d, 5, 6 ); + Kokkos::realloc( c, 5, 6 ); + Kokkos::realloc( d, 5, 6 ); ASSERT_EQ( b.dimension_0(), 5 ); ASSERT_EQ( b.dimension_1(), 6 ); @@ -889,7 +889,7 @@ void test_view_mapping() ASSERT_EQ( d.dimension_1(), 6 ); layout_type layout( 7, 8 ); - Kokkos::Experimental::resize( b, layout ); + Kokkos::resize( b, layout ); for ( int i0 = 0; i0 < 7; ++i0 ) for ( int i1 = 6; i1 < 8; ++i1 ) { @@ -909,8 +909,8 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), val ); } - Kokkos::Experimental::realloc( c, layout ); - Kokkos::Experimental::realloc( d, layout ); + Kokkos::realloc( c, layout ); + Kokkos::realloc( d, layout ); ASSERT_EQ( b.dimension_0(), 7 ); ASSERT_EQ( b.dimension_1(), 8 ); @@ -932,8 +932,8 @@ void test_view_mapping() const int order[] = { 1, 0 }; V a( "a", Kokkos::LayoutStride::order_dimensions( 2, order, dimensions ) ); - M b = Kokkos::Experimental::create_mirror( a ); - M c = Kokkos::Experimental::create_mirror_view( a ); + M b = Kokkos::create_mirror( a ); + M c = Kokkos::create_mirror_view( a ); M d; for ( int i0 = 0; i0 < N0; ++i0 ) @@ -942,8 +942,8 @@ void test_view_mapping() b( i0, i1 ) = 1 + i0 + i1 * N0; } - Kokkos::Experimental::deep_copy( a, b ); - Kokkos::Experimental::deep_copy( c, a ); + Kokkos::deep_copy( a, b ); + Kokkos::deep_copy( c, a ); for ( int i0 = 0; i0 < N0; ++i0 ) for ( int i1 = 0; i1 < N1; ++i1 ) @@ -954,7 +954,7 @@ void test_view_mapping() const int dimensions2[] = { 7, 8 }; const int order2[] = { 1, 0 }; layout_type layout = layout_type::order_dimensions( 2, order2, dimensions2 ); - Kokkos::Experimental::resize( b, layout ); + Kokkos::resize( b, layout ); for ( int i0 = 0; i0 < 7; ++i0 ) for ( int i1 = 0; i1 < 8; ++i1 ) @@ -964,8 +964,8 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), val ); } - Kokkos::Experimental::realloc( c, layout ); - Kokkos::Experimental::realloc( d, layout ); + Kokkos::realloc( c, layout ); + Kokkos::realloc( d, layout ); 
ASSERT_EQ( b.dimension_0(), 7 ); ASSERT_EQ( b.dimension_1(), 8 ); diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp b/lib/kokkos/core/unit_test/TestViewSubview.hpp index e3a12e684e8b45688bf831ebd92f4d7074e236d2..106323492a98902789dfeb47937ab53fcc9a0397 100644 --- a/lib/kokkos/core/unit_test/TestViewSubview.hpp +++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp @@ -915,134 +915,134 @@ void test_3d_subview_5d_impl_layout() { inline void test_subview_legal_args_right() { - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) ); - 
ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) ); - - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); - - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); - - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); - - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) ); - - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); - - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, 
Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( 
Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) ); + + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, 
int >::value ) ); + + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); + + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, 
Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); + + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, 
int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) ); + + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, 
Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); + + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, 
Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); } inline void test_subview_legal_args_left() { - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) ); - - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, 
Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); - - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - 
ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); - - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); - - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< 
Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) ); - - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, 
Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); - - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) ); + + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); + 
ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); + + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) 
); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) ); + + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int 
>::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) ); + + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, 
Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) ); + + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); + + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, 
Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) ); } } // namespace Impl diff --git a/lib/kokkos/core/unit_test/TestWorkGraph.hpp b/lib/kokkos/core/unit_test/TestWorkGraph.hpp new file mode 100644 index 0000000000000000000000000000000000000000..70cf6b47c054ce56e488de6f9821243183e8580b --- /dev/null +++ b/lib/kokkos/core/unit_test/TestWorkGraph.hpp @@ -0,0 +1,172 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <vector> +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace { + +/* This test is meant to be the WorkGraph equivalent of the Task DAG Scheduler test, + please see TestTaskScheduler.hpp for that test. 
+ The algorithm computes the N-th fibonacci number as follows: + - Each "task" or "work item" computes the i-th fibonacci number + - If a task has (i < 2), it will record the known answer ahead of time. + - If a task has (i >= 2), it will "spawn" two more tasks to compute + the (i - 1) and (i - 2) fibonacci numbers. + We do NOT do any de-duplication of these tasks. + De-duplication would result in only (N - 2) tasks which must be run in serial. + We allow duplicates both to increase the number of tasks and to increase the + amount of available parallelism. + */ + +template< class ExecSpace > +struct TestWorkGraph { + + using MemorySpace = typename ExecSpace::memory_space; + using Policy = Kokkos::Experimental::WorkGraphPolicy<std::int32_t, ExecSpace>; + using Graph = typename Policy::graph_type; + using RowMap = typename Graph::row_map_type; + using Entries = typename Graph::entries_type; + using Values = Kokkos::View<long*, MemorySpace>; + + long m_input; + Graph m_graph; + Graph m_transpose; + Values m_values; + + TestWorkGraph(long arg_input):m_input(arg_input) { + form_graph(); + transpose_crs(m_transpose, m_graph); + } + + inline + long full_fibonacci( long n ) { + constexpr long mask = 0x03; + long fib[4] = { 0, 1, 1, 2 }; + for ( long i = 2; i <= n; ++i ) { + fib[ i & mask ] = fib[ ( i - 1 ) & mask ] + fib[ ( i - 2 ) & mask ]; + } + return fib[ n & mask ]; + } + + struct HostEntry { + long input; + std::int32_t parent; + }; + std::vector<HostEntry> form_host_graph() { + std::vector<HostEntry> g; + g.push_back({ m_input , -1 }); + for (std::int32_t i = 0; i < std::int32_t(g.size()); ++i) { + auto e = g.at(std::size_t(i)); + if (e.input < 2) continue; + /* This part of the host graph formation is the equivalent of task spawning + in the Task DAG system.
Notice how each task which is not a base case + spawns two more tasks, without any de-duplication */ + g.push_back({ e.input - 1, i }); + g.push_back({ e.input - 2, i }); + } + return g; + } + + void form_graph() { + auto hg = form_host_graph(); + m_graph.row_map = RowMap("row_map", hg.size() + 1); // row map always has one more + m_graph.entries = Entries("entries", hg.size() - 1); // all but the first have a parent + m_values = Values("values", hg.size()); + auto h_row_map = Kokkos::create_mirror_view(m_graph.row_map); + auto h_entries = Kokkos::create_mirror_view(m_graph.entries); + auto h_values = Kokkos::create_mirror_view(m_values); + h_row_map(0) = 0; + for (std::int32_t i = 0; i < std::int32_t(hg.size()); ++i) { + auto& e = hg.at(std::size_t(i)); + h_row_map(i + 1) = i; + if (e.input < 2) { + h_values(i) = e.input; + } + if (e.parent == -1) continue; + h_entries(i - 1) = e.parent; + } + Kokkos::deep_copy(m_graph.row_map, h_row_map); + Kokkos::deep_copy(m_graph.entries, h_entries); + Kokkos::deep_copy(m_values, h_values); + } + + KOKKOS_INLINE_FUNCTION + void operator()(std::int32_t i) const { + auto begin = m_transpose.row_map(i); + auto end = m_transpose.row_map(i + 1); + for (auto j = begin; j < end; ++j) { + auto k = m_transpose.entries(j); + m_values(i) += m_values( k ); + } + } + + void test_for() { + Kokkos::parallel_for(Policy(m_graph), *this); + auto h_values = Kokkos::create_mirror_view(m_values); + Kokkos::deep_copy(h_values, m_values); + ASSERT_EQ( h_values(0), full_fibonacci(m_input) ); + } + +}; + +} // anonymous namespace + +TEST_F( TEST_CATEGORY, DISABLED_workgraph_fib ) +{ + #ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND + int limit = 15; + #else + int limit = 27; + #endif + for ( int i = 0; i < limit; ++i) { + TestWorkGraph< TEST_EXECSPACE > f(i); + f.test_for(); + } +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/UnitTestMain.cpp b/lib/kokkos/core/unit_test/UnitTestMain.cpp index 
4f52fc956707147761dd60354d9cade69b37bb9a..a7dc7c4973c4930239b3e0689ed85435ff80c5f1 100644 --- a/lib/kokkos/core/unit_test/UnitTestMain.cpp +++ b/lib/kokkos/core/unit_test/UnitTestMain.cpp @@ -42,6 +42,7 @@ */ #include <gtest/gtest.h> +#include <cstdlib> int main( int argc, char *argv[] ) { ::testing::InitGoogleTest( &argc, argv ); diff --git a/lib/kokkos/core/unit_test/UnitTestMainInit.cpp b/lib/kokkos/core/unit_test/UnitTestMainInit.cpp index 21f851274b81c3c90f3d83aea82a2bd10f08aafd..62a01e9033eae9ca9b1f4f260d92efeda66ff8b2 100644 --- a/lib/kokkos/core/unit_test/UnitTestMainInit.cpp +++ b/lib/kokkos/core/unit_test/UnitTestMainInit.cpp @@ -42,6 +42,8 @@ */ #include <gtest/gtest.h> +#include <cstdlib> + #include <Kokkos_Core.hpp> int main( int argc, char *argv[] ) { diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp index ba06b71192061191ea71745795ba93f1fd5ae84e..fa6722615c875b6a026e0f992ad8290f2c2c0058 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp @@ -48,3 +48,5 @@ #include<TestMemoryPool.hpp> #include<TestCXX11.hpp> #include<TestTile.hpp> + +#include<TestViewCtorPropEmbeddedDim.hpp> diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8424ae10d64b4df809ae185f37a7bdfe79fd78f5 --- /dev/null +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include<cuda/TestCuda_Category.hpp> +#include<TestUniqueToken.hpp> + diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..663ca1d560ce66b9d6b887625c616e439f281443 --- /dev/null +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include<cuda/TestCuda_Category.hpp> +#include<TestWorkGraph.hpp> diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c02905535ba0e638a2d358b612e0d035114b3784 --- /dev/null +++ b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> +#include "TestResize.hpp" + +namespace Test { + +TEST( kokkosresize, host_space_access ) +{ + // Test with the default device type. 
+ using TestViewResize::testResize; + typedef Kokkos::View<int*>::device_type device_type; + testResize<device_type> (); +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp index 2f8daf7ad74a6c8c0533aeec27cd6d0370969a40..c12574a65a6e6ef66687cd8b0d39e7a343017a46 100644 --- a/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp @@ -86,25 +86,26 @@ class openmp : public ::testing::Test { protected: static void SetUpTestCase() { - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); + int threads_count = 0; + #pragma omp parallel + { + #pragma omp atomic + ++threads_count; + } - const unsigned threads_count = std::max( 1u, numa_count ) * - std::max( 2u, ( cores_per_numa * threads_per_core ) / 2 ); + if (threads_count > 3) { + threads_count /= 2; + } Kokkos::OpenMP::initialize( threads_count ); Kokkos::print_configuration( std::cout, true ); + srand( 10231 ); } static void TearDownTestCase() { Kokkos::OpenMP::finalize(); - - omp_set_num_threads( 1 ); - - ASSERT_EQ( 1, omp_get_max_threads() ); } }; diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp index 5e9535638d59c289b7eb18dedbf48639e9f9722b..33e7402ce65704d90a628183d50f952e559dead9 100644 --- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp @@ -48,3 +48,93 @@ #include<TestMemoryPool.hpp> #include<TestCXX11.hpp> #include<TestTile.hpp> + +#include<TestViewCtorPropEmbeddedDim.hpp> + +#include <mutex> + +namespace Test { + +TEST_F( openmp, partition_master ) +{ + using Mutex = Kokkos::Experimental::MasterLock<Kokkos::OpenMP>; + + Mutex mtx; + int errors = 0; + + auto 
master = [&errors, &mtx](int partition_id, int num_partitions) { + + const int pool_size = Kokkos::OpenMP::thread_pool_size(); + + { + std::unique_lock<Mutex> lock(mtx); + if ( Kokkos::OpenMP::in_parallel() ) { + ++errors; + } + if ( Kokkos::OpenMP::thread_pool_rank() != 0 ) { + ++errors; + } + } + + { + int local_errors = 0; + Kokkos::parallel_reduce( Kokkos::RangePolicy<Kokkos::OpenMP>(0,1000) + , [pool_size]( const int , int & errs ) { + if ( Kokkos::OpenMP::thread_pool_size() != pool_size ) { + ++errs; + } + } + , local_errors + ); + Kokkos::atomic_add( &errors, local_errors ); + } + + Kokkos::Experimental::UniqueToken< Kokkos::OpenMP > token; + + Kokkos::View<int*, Kokkos::OpenMP> count( "", token.size() ); + + Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::OpenMP>(0,1000), + [=] ( const int ) { + int i = token.acquire(); + ++count[i]; + token.release(i); + }); + + Kokkos::View<int,Kokkos::OpenMP> sum (""); + Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::OpenMP>(0,token.size()), + [=] ( const int i ) { + Kokkos::atomic_add( sum.data(), count[i] ); + }); + + if (sum() != 1000) { + Kokkos::atomic_add( &errors, 1 ); + } + }; + + master(0,1); + + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 4, 0 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 0, 4 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 2, 2 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 8, 0 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 0, 8 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 8, 8 ); + ASSERT_EQ( errors, 0 ); +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..143a6d99104607ad2bb31d79074ead5670c81177 --- /dev/null +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include<openmp/TestOpenMP_Category.hpp> +#include<TestUniqueToken.hpp> + diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ec6fa1653c051d03ac92ee1436041f0b587fee9a --- /dev/null +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include<openmp/TestOpenMP_Category.hpp> +#include<TestWorkGraph.hpp> diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp index a6a76a03bd3b82c714f916f76ba5e48758ce540c..bc39b1e16088c3c2933aa28d792f39226fe11e9c 100644 --- a/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp +++ b/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp @@ -48,3 +48,5 @@ #include<TestMemoryPool.hpp> #include<TestCXX11.hpp> #include<TestTile.hpp> + +#include<TestViewCtorPropEmbeddedDim.hpp> diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de1638de5ed8a1e4f9ce3af8a50e5a2af9feab5c --- /dev/null +++ b/lib/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include<serial/TestSerial_Category.hpp> +#include<TestWorkGraph.hpp> diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp index c11155c5c0c1d55dcc580faa12bcd3646d5d55db..160b37a2c8c8c9c737b8bfac6e87887db55ece7d 100644 --- a/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp +++ b/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp @@ -48,3 +48,5 @@ #include<TestMemoryPool.hpp> #include<TestCXX11.hpp> #include<TestTile.hpp> + +#include<TestViewCtorPropEmbeddedDim.hpp> diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6b7dbb26db222624a3efe26eb5c718da50a51ce7 --- /dev/null +++ b/lib/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include<threads/TestThreads_Category.hpp> +#include<TestWorkGraph.hpp> diff --git a/lib/kokkos/example/cmake_build/CMakeLists.txt b/lib/kokkos/example/cmake_build/CMakeLists.txt index 4e149726ee780f961c386eb49de28d4fb18284a0..f92c5c6513f99a8e2084ee11a75d6317ae760131 100644 --- a/lib/kokkos/example/cmake_build/CMakeLists.txt +++ b/lib/kokkos/example/cmake_build/CMakeLists.txt @@ -40,5 +40,7 @@ list(APPEND CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -O3) add_subdirectory(${Example_SOURCE_DIR}/../.. 
${Example_BINARY_DIR}/kokkos) +include_directories(${Kokkos_INCLUDE_DIRS_RET}) + add_executable(example cmake_example.cpp) target_link_libraries(example kokkos) diff --git a/lib/kokkos/example/feint/main.cpp b/lib/kokkos/example/feint/main.cpp index 616e584bf68fb0f1a4e935aeb43b965ce8d04221..57a8f8fafb366fd11fba7b6a4f250f0c6f2559f3 100644 --- a/lib/kokkos/example/feint/main.cpp +++ b/lib/kokkos/example/feint/main.cpp @@ -69,12 +69,26 @@ int main() #if defined( KOKKOS_ENABLE_OPENMP ) { - // Use 4 cores per NUMA region, unless fewer available - - const unsigned use_numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() ); - Kokkos::OpenMP::initialize( use_numa_count * use_cores_per_numa ); + int num_threads = 0; + if ( Kokkos::hwloc::available() ) { + // Use 4 cores per NUMA region, unless fewer available + const unsigned use_numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() ); + num_threads = use_numa_count * use_cores_per_numa; + + } + else { + #pragma omp parallel + { + #pragma omp atomic + ++num_threads; + } + num_threads = std::max(4, num_threads/4); + } + + + Kokkos::OpenMP::initialize( num_threads ); std::cout << "feint< OpenMP , NotUsingAtomic >" << std::endl ; Kokkos::Example::feint< Kokkos::OpenMP , false >(); diff --git a/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp b/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp index fb33aef56e8f9eb5e5bd5beb7532d799efaef794..b6b8b2f5e0322f2a50ac1fb0d23f84bd5419a6f3 100644 --- a/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp +++ b/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp @@ -138,7 +138,16 @@ int main(int argc, char *argv[]) #endif #ifdef KOKKOS_ENABLE_OPENMP - Kokkos::OpenMP::initialize( threads_count ); + int num_threads = 0; + #pragma omp parallel + { + #pragma omp atomic + ++num_threads; + } + 
if( num_threads > 3 ) { + num_threads = std::max(4, num_threads/4); + } + Kokkos::OpenMP::initialize( num_threads ); num_errors += G2L::run_openmp(num_ids,num_find_iterations); Kokkos::OpenMP::finalize(); #endif diff --git a/lib/kokkos/example/grow_array/main.cpp b/lib/kokkos/example/grow_array/main.cpp index e7438a9bf4e7429b658c5834eece03aeb2f5467e..3f1d534d933ec3a90f921ed73404cb132bc0e46c 100644 --- a/lib/kokkos/example/grow_array/main.cpp +++ b/lib/kokkos/example/grow_array/main.cpp @@ -88,7 +88,7 @@ int main( int argc , char ** argv ) #if defined( KOKKOS_ENABLE_OPENMP ) { std::cout << "Kokkos::OpenMP" << std::endl ; - Kokkos::OpenMP::initialize( num_threads , use_numa , use_core ); + Kokkos::OpenMP::initialize(); Example::grow_array< Kokkos::OpenMP >( length_array , span_values ); Kokkos::OpenMP::finalize(); } diff --git a/lib/kokkos/example/tutorial/03_simple_view/Makefile b/lib/kokkos/example/tutorial/03_simple_view/Makefile index e716b765e7f1778d839f2dcd603d258d2287c8fe..32483a255585aa85c058c671063d674da062315b 100644 --- a/lib/kokkos/example/tutorial/03_simple_view/Makefile +++ b/lib/kokkos/example/tutorial/03_simple_view/Makefile @@ -33,6 +33,7 @@ include $(KOKKOS_PATH)/Makefile.kokkos build: $(EXE) +#for unit testing only, for best preformance with OpenMP 4.0 or better test: $(EXE) ./$(EXE) diff --git a/lib/kokkos/example/tutorial/Advanced_Views/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/Makefile index bc4012f68cfa22fcf0c9ac074391f26bd7a149d8..12ac5652e5798c11f2285e4294fcc88ce771093e 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/Makefile @@ -22,100 +22,102 @@ endif build: mkdir -p 01_data_layouts cd ./01_data_layouts; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_memory_traits cd 
./02_memory_traits; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_subviews cd ./03_subviews; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} mkdir -p 04_dualviews cd ./04_dualviews; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} mkdir -p 05_NVIDIA_UVM cd ./05_NVIDIA_UVM; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} #mkdir -p 06_AtomicViews #cd ./06_AtomicViews; \ - #make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} #mkdir -p 07_Overlapping_DeepCopy #cd ./07_Overlapping_DeepCopy; \ - #make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} build-insource: cd ./01_data_layouts; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd 
./05_NVIDIA_UVM; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make build -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) build ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make build -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) build ${KOKKOS_SETTINGS} + test: cd ./01_data_layouts; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile 
${KOKKOS_SETTINGS} test-insource: cd ./01_data_layouts; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make test -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) test ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make test -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) test ${KOKKOS_SETTINGS} + clean: cd ./01_data_layouts; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile 
${KOKKOS_SETTINGS} + #$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} clean-insource: cd ./01_data_layouts; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make clean ${KOKKOS_SETTINGS} + #$(MAKE) clean ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make clean ${KOKKOS_SETTINGS} + #$(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/lib/kokkos/example/tutorial/Algorithms/Makefile b/lib/kokkos/example/tutorial/Algorithms/Makefile index ad0b76f9d66f4e3f35f5f1dc329b976c2603353e..4e70ba7d976fe5e364049bd46eae3b7f2c9b1153 100644 --- a/lib/kokkos/example/tutorial/Algorithms/Makefile +++ b/lib/kokkos/example/tutorial/Algorithms/Makefile @@ -22,22 +22,22 @@ endif build: mkdir -p 01_random_numbers cd ./01_random_numbers; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} build-insource: cd ./01_random_numbers; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} test: cd ./01_random_numbers; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f 
${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} test-insource: cd ./01_random_numbers; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} clean: cd ./01_random_numbers; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} clean-insource: cd ./01_random_numbers; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile index 44fdf90f8a837da174b96fcb9032b3e47920390f..4bf6d487ae977ca6bd42e9f5787314bf4fd8bbe7 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile @@ -22,74 +22,74 @@ endif build: mkdir -p 01_thread_teams cd ./01_thread_teams; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} mkdir -p 01_thread_teams_lambda cd ./01_thread_teams_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_nested_parallel_for cd ./02_nested_parallel_for; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_vectorization cd ./03_vectorization; \ - make build -j 4 -f 
${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} mkdir -p 04_team_scan cd ./04_team_scan; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} build-insource: cd ./01_thread_teams; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} test: cd ./01_thread_teams; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + 
$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} test-insource: cd ./01_thread_teams; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} clean: cd ./01_thread_teams; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make clean 
-f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} clean-insource: cd ./01_thread_teams; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/lib/kokkos/example/tutorial/Makefile b/lib/kokkos/example/tutorial/Makefile index 063ace8aabbe6017611ac17a54d12a47cb7e3196..7b2732eeedc2c91f5648aeacfb2aa27817e1fae0 100644 --- a/lib/kokkos/example/tutorial/Makefile +++ b/lib/kokkos/example/tutorial/Makefile @@ -23,152 +23,152 @@ endif build: mkdir -p 01_hello_world cd ./01_hello_world; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} mkdir -p 01_hello_world_lambda cd ./01_hello_world_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_simple_reduce cd ./02_simple_reduce; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_simple_reduce_lambda cd ./02_simple_reduce_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f 
${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_simple_view cd ./03_simple_view; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_simple_view_lambda cd ./03_simple_view_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 04_simple_memoryspaces cd ./04_simple_memoryspaces; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} mkdir -p 05_simple_atomics cd ./05_simple_atomics; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} mkdir -p Advanced_Views cd ./Advanced_Views; \ - make build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' mkdir -p Algorithms cd ./Algorithms; \ - make build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' mkdir -p Hierarchical_Parallelism cd ./Hierarchical_Parallelism; \ - make build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' build-insource: cd ./01_hello_world; \ - make build -j 4 
${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' test: cd ./01_hello_world; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f 
${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' test-insource: cd ./01_hello_world; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make test -j 4 ${KOKKOS_SETTINGS} + 
$(MAKE) test ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' clean: cd ./01_hello_world; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile 
${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' clean-insource: cd ./01_hello_world; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make clean ${KOKKOS_SETTINGS} + 
$(MAKE) clean ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' diff --git a/lib/kokkos/example/tutorial/launch_bounds/CMakeLists.txt b/lib/kokkos/example/tutorial/launch_bounds/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c78db840f849fd9625676c6a73e8aa037b52b4d --- /dev/null +++ b/lib/kokkos/example/tutorial/launch_bounds/CMakeLists.txt @@ -0,0 +1,10 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +TRIBITS_ADD_EXECUTABLE( + tutorial_launch_bounds + SOURCES launch_bounds_reduce.cpp + COMM serial mpi + ) diff --git a/lib/kokkos/example/tutorial/launch_bounds/Makefile b/lib/kokkos/example/tutorial/launch_bounds/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..5b605a4119d32f974e1bdeaa3b97316b3ea607de --- /dev/null +++ b/lib/kokkos/example/tutorial/launch_bounds/Makefile @@ -0,0 +1,56 @@ +KOKKOS_PATH = ../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/launch_bounds/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 -Xptxas=-v +LINK = ${CXX} +LINKFLAGS = +EXE = launch_bounds.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LINKFLAGS = +EXE = launch_bounds.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +# WAR for "undefined memcpy" w/ Ubuntu + CUDA 7.5 +CXXFLAGS += -D_FORCE_INLINES +# Additional compile-time information (-Xptxas=-v) is nvcc-only, so it is +# set in the Cuda branch above; g++ rejects it as an unrecognized option. + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +temp: + echo $(KOKKOS_INTERNAL_USE_CUDA) $(CUDA_PATH) + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp b/lib/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9a26eda5073a248a1360c489bfe1d64c889857d3 --- /dev/null +++ b/lib/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp @@ -0,0 +1,173 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// +// First reduction (parallel_reduce) example: +// 1. Start up Kokkos +// 2. Execute a parallel_reduce loop in the default execution space, +// using a functor to define the loop body +// 3. 
Shut down Kokkos +// +struct collision { +// Reduction functor +// For each i, we generate 10 hashes, look for and count collisions +// We use parallel_reduce to count the total collisions +// Note that we're just counting collisions within the 10 hashes +// generated for one i. +// This function was chosen as one that very simply can increase the +// register count. + typedef int value_type; + + KOKKOS_INLINE_FUNCTION + int hash(int q) const { + // A simple hash by Justin Sobel + // Thanks to Arash Partow (partow.net) + char* fourchars = (char*)&q; + int hash = 1315423911; + for (int i=0; i<4; fourchars++, i++) { + hash ^= ((hash<<5) + *fourchars + (hash >> 2)); + } + return hash; + } + + KOKKOS_INLINE_FUNCTION + void operator () (const int i, int& lsum) const { + //This is a silly function which generates 10 hashes + // then checks for collisions + int a = hash(i)%64; + int b = hash(i*3)%64; + int c = hash(i*5)%64; + int d = hash(i*7)%64; + int e = hash(i*11)%64; + int f = hash(i*17)%64; + int g = hash(i*23)%64; + int h = hash(i*29)%64; + int j = hash(i*31)%64; + int k = hash(i*37)%64; + + + if (a==b) lsum++; + if (a==c) lsum++; + if (a==d) lsum++; + if (a==e) lsum++; + if (a==f) lsum++; + if (a==g) lsum++; + if (a==h) lsum++; + if (a==j) lsum++; + if (a==k) lsum++; + if (b==c) lsum++; + if (b==d) lsum++; + if (b==e) lsum++; + if (b==f) lsum++; + if (b==g) lsum++; + if (b==h) lsum++; + if (b==j) lsum++; + if (b==k) lsum++; + if (c==d) lsum++; + if (c==e) lsum++; + if (c==f) lsum++; + if (c==g) lsum++; + if (c==h) lsum++; + if (c==j) lsum++; + if (c==k) lsum++; + if (d==e) lsum++; + if (d==f) lsum++; + if (d==g) lsum++; + if (d==h) lsum++; + if (d==j) lsum++; + if (d==k) lsum++; + if (e==f) lsum++; + if (e==g) lsum++; + if (e==h) lsum++; + if (e==j) lsum++; + if (e==k) lsum++; + if (f==g) lsum++; + if (f==h) lsum++; + if (f==j) lsum++; + if (f==k) lsum++; + if (g==h) lsum++; + if (g==j) lsum++; + if (g==k) lsum++; + if (h==j) lsum++; + if (h==k) lsum++; + if (j==k) lsum++; + 
} + + + +}; + +int main (int argc, char* argv[]) { + Kokkos::initialize (argc, argv); + const int n = 10000; + + // Compute and count hash collisions in + // parallel, using Kokkos. + // This is not really a useful algorithm, but it demonstrates the + // LaunchBounds functionality + int sum1 = 0; + int sum2 = 0; + + //Without LaunchBounds, the kernel uses 56 registers + Kokkos::parallel_reduce (n, collision (), sum1); + + //With LaunchBounds, we can reduce the register usage to 32 + Kokkos::parallel_reduce (Kokkos::RangePolicy<Kokkos::LaunchBounds<512,4>>(0,n), collision (), sum2); + + printf ("Number of collisions, " + "computed in parallel, is %i\n", sum1); + + // Always finalize Kokkos, even on a mismatch, before returning the status + const bool results_match = (sum1 == sum2); + if (!results_match) { + printf( "Uh-oh! Results do not match\n"); + } + + Kokkos::finalize(); + + return results_match ? 0 : -1; +} + diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash index 5f2442102d0117a28cea52183e74d3fca08cc3c8..6d636dc7e4fc0e8225556bca214b0984f2c43bca 100755 --- a/lib/kokkos/generate_makefile.bash +++ b/lib/kokkos/generate_makefile.bash @@ -1,7 +1,6 @@ #!/bin/bash KOKKOS_DEVICES="" -MAKE_J_OPTION="32" KOKKOS_DO_EXAMPLES="1" @@ -70,7 +69,8 @@ do KOKKOS_DEBUG=yes ;; --make-j*) - MAKE_J_OPTION="${key#*=}" + echo "Warning: ${key} is deprecated" + echo "Call make with appropriate -j flag" ;; --no-examples) KOKKOS_DO_EXAMPLES="0" @@ -110,23 +110,34 @@ do echo "--with-devices: Explicitly add a set of backends." echo "" echo "--arch=[OPT]: Set target architectures. 
Options are:" + echo " [AMD]" + echo " AMDAVX = AMD CPU" + echo " [ARM]" echo " ARMv80 = ARMv8.0 Compatible CPU" echo " ARMv81 = ARMv8.1 Compatible CPU" echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" + echo " [IBM]" + echo " Power8 = IBM POWER8 CPUs" + echo " Power9 = IBM POWER9 CPUs" + echo " [Intel]" + echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" echo " SKX = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)" + echo " [Intel Xeon Phi]" echo " KNC = Intel Knights Corner Xeon Phi" echo " KNL = Intel Knights Landing Xeon Phi" + echo " [NVIDIA]" echo " Kepler30 = NVIDIA Kepler generation CC 3.0" + echo " Kepler32 = NVIDIA Kepler generation CC 3.2" echo " Kepler35 = NVIDIA Kepler generation CC 3.5" echo " Kepler37 = NVIDIA Kepler generation CC 3.7" + echo " Maxwell50 = NVIDIA Maxwell generation CC 5.0" + echo " Maxwell52 = NVIDIA Maxwell generation CC 5.2" + echo " Maxwell53 = NVIDIA Maxwell generation CC 5.3" echo " Pascal60 = NVIDIA Pascal generation CC 6.0" echo " Pascal61 = NVIDIA Pascal generation CC 6.1" - echo " Maxwell50 = NVIDIA Maxwell generation CC 5.0" - echo " Power8 = IBM POWER8 CPUs" - echo " Power9 = IBM POWER9 CPUs" echo "" echo "--compiler=/Path/To/Compiler Set the compiler." echo "--debug,-dbg: Enable Debugging." @@ -142,10 +153,14 @@ do echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc." echo "--with-options=[OPT]: Additional options to Kokkos:" + echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" + echo " disable_profiling = do not compile with profiling hooks" + echo " " echo "--with-cuda-options=[OPT]: Additional options to CUDA:" echo " force_uvm, use_ldg, enable_lambda, rdc" - echo "--make-j=[NUM]: Set -j flag used during build." 
+ echo "--make-j=[NUM]: DEPRECATED: call make with appropriate" + echo " -j flag" exit 0 ;; *) @@ -237,27 +252,27 @@ else KOKKOS_INSTALL_PATH=${KOKKOS_TEST_INSTALL_PATH} fi -mkdir install +mkdir -p install echo "#Makefile to satisfy existens of target kokkos-clean before installing the library" > install/Makefile.kokkos echo "kokkos-clean:" >> install/Makefile.kokkos echo "" >> install/Makefile.kokkos -mkdir core -mkdir core/unit_test -mkdir core/perf_test -mkdir containers -mkdir containers/unit_tests -mkdir containers/performance_tests -mkdir algorithms -mkdir algorithms/unit_tests -mkdir algorithms/performance_tests -mkdir example -mkdir example/fixture -mkdir example/feint -mkdir example/fenl -mkdir example/tutorial +mkdir -p core +mkdir -p core/unit_test +mkdir -p core/perf_test +mkdir -p containers +mkdir -p containers/unit_tests +mkdir -p containers/performance_tests +mkdir -p algorithms +mkdir -p algorithms/unit_tests +mkdir -p algorithms/performance_tests +mkdir -p example +mkdir -p example/fixture +mkdir -p example/feint +mkdir -p example/fenl +mkdir -p example/tutorial if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then - mkdir example/ichol + mkdir -p example/ichol fi KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" @@ -266,115 +281,115 @@ KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/unit_test/Makefile echo "" >> core/unit_test/Makefile echo "all:" >> core/unit_test/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS}" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS}" >> core/unit_test/Makefile echo "" >> core/unit_test/Makefile echo "test: all" >> core/unit_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} test" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f 
${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} test" >> core/unit_test/Makefile echo "" >> core/unit_test/Makefile echo "clean:" >> core/unit_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/unit_test/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/perf_test/Makefile echo "" >> core/perf_test/Makefile echo "all:" >> core/perf_test/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS}" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS}" >> core/perf_test/Makefile echo "" >> core/perf_test/Makefile echo "test: all" >> core/perf_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} test" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} test" >> core/perf_test/Makefile echo "" >> core/perf_test/Makefile echo "clean:" >> core/perf_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/perf_test/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/unit_tests/Makefile echo "" >> containers/unit_tests/Makefile echo "all:" >> containers/unit_tests/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/unit_tests/Makefile echo "" >> containers/unit_tests/Makefile echo "test: all" >> containers/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> 
containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/unit_tests/Makefile echo "" >> containers/unit_tests/Makefile echo "clean:" >> containers/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/unit_tests/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/performance_tests/Makefile echo "" >> containers/performance_tests/Makefile echo "all:" >> containers/performance_tests/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/performance_tests/Makefile echo "" >> containers/performance_tests/Makefile echo "test: all" >> containers/performance_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/performance_tests/Makefile echo "" >> containers/performance_tests/Makefile echo "clean:" >> containers/performance_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/performance_tests/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > algorithms/unit_tests/Makefile echo "" >> algorithms/unit_tests/Makefile echo "all:" >> algorithms/unit_tests/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f 
${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> algorithms/unit_tests/Makefile echo "" >> algorithms/unit_tests/Makefile echo "test: all" >> algorithms/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> algorithms/unit_tests/Makefile echo "" >> algorithms/unit_tests/Makefile echo "clean:" >> algorithms/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> algorithms/unit_tests/Makefile KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_TEST_INSTALL_PATH}" echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/fixture/Makefile echo "" >> example/fixture/Makefile echo "all:" >> example/fixture/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS}" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS}" >> example/fixture/Makefile echo "" >> example/fixture/Makefile echo "test: all" >> example/fixture/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} test" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} test" >> example/fixture/Makefile echo "" >> example/fixture/Makefile echo "clean:" >> example/fixture/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} clean" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f 
${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} clean" >> example/fixture/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/feint/Makefile echo "" >> example/feint/Makefile echo "all:" >> example/feint/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS}" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS}" >> example/feint/Makefile echo "" >> example/feint/Makefile echo "test: all" >> example/feint/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} test" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} test" >> example/feint/Makefile echo "" >> example/feint/Makefile echo "clean:" >> example/feint/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} clean" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} clean" >> example/feint/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/fenl/Makefile echo "" >> example/fenl/Makefile echo "all:" >> example/fenl/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS}" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS}" >> example/fenl/Makefile echo "" >> example/fenl/Makefile echo "test: all" >> example/fenl/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} test" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} test" >> example/fenl/Makefile echo "" >> example/fenl/Makefile echo "clean:" >> example/fenl/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} clean" >> example/fenl/Makefile +echo -e "\t\$(MAKE) 
-f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} clean" >> example/fenl/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/tutorial/Makefile echo "" >> example/tutorial/Makefile echo "build:" >> example/tutorial/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} build">> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} build">> example/tutorial/Makefile echo "" >> example/tutorial/Makefile echo "test: build" >> example/tutorial/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} test" >> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} test" >> example/tutorial/Makefile echo "" >> example/tutorial/Makefile echo "clean:" >> example/tutorial/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} clean" >> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} clean" >> example/tutorial/Makefile if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/ichol/Makefile echo "" >> example/ichol/Makefile echo "all:" >> example/ichol/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS}" >> example/ichol/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS}" >> example/ichol/Makefile echo "" >> example/ichol/Makefile echo "test: all" >> example/ichol/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f 
${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} test" >> example/ichol/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} test" >> example/ichol/Makefile echo "" >> example/ichol/Makefile echo "clean:" >> example/ichol/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} clean" >> example/ichol/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} clean" >> example/ichol/Makefile fi KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" @@ -385,62 +400,64 @@ echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > Makefile echo "" >> Makefile echo "kokkoslib:" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} build-lib" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} build-lib" >> Makefile echo "" >> Makefile echo "install: kokkoslib" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} install" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} install" >> Makefile echo "" >> Makefile echo "kokkoslib-test:" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} build-lib" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} build-lib" >> Makefile echo "" >> Makefile echo "install-test: kokkoslib-test" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} 
PREFIX=${KOKKOS_TEST_INSTALL_PATH} install" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} install" >> Makefile echo "" >> Makefile echo "build-test: install-test" >> Makefile -echo -e "\tmake -C core/unit_test" >> Makefile -echo -e "\tmake -C core/perf_test" >> Makefile -echo -e "\tmake -C containers/unit_tests" >> Makefile -echo -e "\tmake -C containers/performance_tests" >> Makefile -echo -e "\tmake -C algorithms/unit_tests" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests" >> Makefile if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then -echo -e "\tmake -C example/fixture" >> Makefile -echo -e "\tmake -C example/feint" >> Makefile -echo -e "\tmake -C example/fenl" >> Makefile -echo -e "\tmake -C example/tutorial build" >> Makefile +echo -e "\t\$(MAKE) -C example/fixture" >> Makefile +echo -e "\t\$(MAKE) -C example/feint" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl" >> Makefile +echo -e "\t\$(MAKE) -C example/tutorial build" >> Makefile fi echo "" >> Makefile echo "test: build-test" >> Makefile -echo -e "\tmake -C core/unit_test test" >> Makefile -echo -e "\tmake -C core/perf_test test" >> Makefile -echo -e "\tmake -C containers/unit_tests test" >> Makefile -echo -e "\tmake -C containers/performance_tests test" >> Makefile -echo -e "\tmake -C algorithms/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests test" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then -echo -e "\tmake -C 
example/fixture test" >> Makefile -echo -e "\tmake -C example/feint test" >> Makefile -echo -e "\tmake -C example/fenl test" >> Makefile -echo -e "\tmake -C example/tutorial test" >> Makefile +echo -e "\t\$(MAKE) -C example/fixture test" >> Makefile +echo -e "\t\$(MAKE) -C example/feint test" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl test" >> Makefile +echo -e "\t\$(MAKE) -C example/tutorial test" >> Makefile fi echo "" >> Makefile echo "unit-tests-only:" >> Makefile -echo -e "\tmake -C core/unit_test test" >> Makefile -echo -e "\tmake -C containers/unit_tests test" >> Makefile -echo -e "\tmake -C algorithms/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile echo "" >> Makefile + echo "clean:" >> Makefile -echo -e "\tmake -C core/unit_test clean" >> Makefile -echo -e "\tmake -C core/perf_test clean" >> Makefile -echo -e "\tmake -C containers/unit_tests clean" >> Makefile -echo -e "\tmake -C containers/performance_tests clean" >> Makefile -echo -e "\tmake -C algorithms/unit_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test clean" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test clean" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests clean" >> Makefile if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then -echo -e "\tmake -C example/fixture clean" >> Makefile -echo -e "\tmake -C example/feint clean" >> Makefile -echo -e "\tmake -C example/fenl clean" >> Makefile -echo -e "\tmake -C example/tutorial clean" >> Makefile +echo -e "\t\$(MAKE) -C example/fixture clean" >> Makefile +echo -e "\t\$(MAKE) -C example/feint clean" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl clean" >> Makefile +echo -e "\t\$(MAKE) -C example/tutorial clean" >> Makefile fi echo 
-e "\tcd core; \\" >> Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} clean" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} clean" >> Makefile +