diff --git a/lib/kokkos/Copyright.txt b/lib/kokkos/Copyright.txt new file mode 100755 index 0000000000000000000000000000000000000000..05980758fa8fe6317bb08fcc6eb70668b5fd1580 --- /dev/null +++ b/lib/kokkos/Copyright.txt @@ -0,0 +1,40 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER diff --git a/lib/kokkos/LICENSE b/lib/kokkos/LICENSE new file mode 100755 index 0000000000000000000000000000000000000000..05980758fa8fe6317bb08fcc6eb70668b5fd1580 --- /dev/null +++ b/lib/kokkos/LICENSE @@ -0,0 +1,40 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos new file mode 100755 index 0000000000000000000000000000000000000000..473039af52bb23ebcb68b5b7494a0c3625b92154 --- /dev/null +++ b/lib/kokkos/Makefile.kokkos @@ -0,0 +1,318 @@ +# Default settings common options + +KOKKOS_PATH=../../lib/kokkos + +#Options: OpenMP,Serial,Pthreads,Cuda +KOKKOS_DEVICES ?= "OpenMP" +#KOKKOS_DEVICES ?= "Pthreads" +#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8 +KOKKOS_ARCH ?= "" +#Options: yes,no +KOKKOS_DEBUG ?= "no" +#Options: hwloc,librt +KOKKOS_USE_TPLS ?= "" + +#Default settings specific options +#Options: force_uvm,use_ldg,rdc +KOKKOS_CUDA_OPTIONS ?= "" + +# Check for general settings + +KOKKOS_CXX_STANDARD ?= "c++11" + +KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l)) +KOKKOS_INTERNAL_ENABLE_PROFILING_COLLECT_KERNEL_DATA := $(strip $(shell echo $(KOKKOS_PROFILING) | grep "kernel_times" | wc -l)) 
+KOKKOS_INTERNAL_ENABLE_PROFILING_AGGREGATE_MPI := $(strip $(shell echo $(KOKKOS_PROFILING) | grep "aggregate_mpi" | wc -l)) +KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l)) + +# Check for external libraries +KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l)) +KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l)) + +# Check for advanced settings +KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l)) +KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "force_uvm" | wc -l)) +KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l)) + +# Check for Kokkos Host Execution Spaces one of which must be on + +KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMP | wc -l)) +KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l)) +KOKKOS_INTERNAL_USE_SERIAL := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Serial | wc -l)) + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0) +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0) + KOKKOS_INTERNAL_USE_SERIAL := 1 +endif +endif + +KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version | grep PGI | wc -l) + +ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -mp +else + KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp +endif + +ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_INTERNAL_CXX11_FLAG := --c++11 +else + KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11 +endif +# Check for other Execution Spaces + +KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l)) + +# Check for Kokkos Architecture settings + +#Intel based +KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep 
SNB | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l)) + +#NVIDIA based +KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler35 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler37 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc)) + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) +KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc)) +endif + +#ARM based +KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8 | wc -l)) + +#IBM based +KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | 
grep BGQ | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power7 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power8 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc)) + +#AMD based +KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l)) + +#Any AVX? +KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc )) +KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc )) + +#Incompatible flags? +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)>1" | bc )) +KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc)) + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) + $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) ) +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1) + $(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) ) +endif + +#Generating the list of Flags + +KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src +# No warnings: +KOKKOS_CXXFLAGS = +# INTEL and CLANG warnings: +#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized +# GCC warnings: +#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered + +KOKKOS_LIBS = -lkokkos +KOKKOS_LDFLAGS = -L$(shell pwd) +KOKKOS_SRC = +KOKKOS_HEADERS = + +#Generating the KokkosCore_config.h file + +tmp := $(shell echo 
"/* ---------------------------------------------" > KokkosCore_config.tmp) +tmp := $(shell echo "Makefile constructed configuration:" >> KokkosCore_config.tmp) +tmp := $(shell date >> KokkosCore_config.tmp) +tmp := $(shell echo "----------------------------------------------*/" >> KokkosCore_config.tmp) + + +tmp := $(shell echo "/* Execution Spaces */" >> KokkosCore_config.tmp) +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp ) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp ) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp ) +endif + +tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp) +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG) + tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp ) +endif + +ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + KOKKOS_CXXFLAGS += -G +endif + KOKKOS_CXXFLAGS += -g + KOKKOS_LDFLAGS += -g -ldl + tmp := $(shell echo "\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK 1" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#define KOKKOS_HAVE_DEBUG 1" >> KokkosCore_config.tmp ) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) + KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include + KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib + KOKKOS_LIBS += -lhwloc + tmp := $(shell echo "\#define KOKKOS_HAVE_HWLOC 1" >> KokkosCore_config.tmp ) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) + tmp := $(shell echo "\#define KOKKOS_USE_LIBRT 1" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#define PREC_TIMER 1" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#define KOKKOSP_ENABLE_RTLIB 1" >> 
KokkosCore_config.tmp ) + KOKKOS_LIBS += -lrt +endif + +tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp) + +ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1) + tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp ) +endif + +ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) + tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp ) +endif + +ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1) + tmp := $(shell echo "\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1" >> KokkosCore_config.tmp ) + KOKKOS_CXXFLAGS += --relocatable-device-code=true + KOKKOS_LDFLAGS += --relocatable-device-code=true +endif + +#Add Architecture flags + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) + KOKKOS_CXXFLAGS += -mavx + KOKKOS_LDFLAGS += -mavx +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1) + KOKKOS_CXXFLAGS += -xcore-avx2 + KOKKOS_LDFLAGS += -xcore-avx2 +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1) + KOKKOS_CXXFLAGS += -mmic + KOKKOS_LDFLAGS += -mmic +endif + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) + KOKKOS_CXXFLAGS += -arch=sm_30 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) + KOKKOS_CXXFLAGS += -arch=sm_32 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) + KOKKOS_CXXFLAGS += -arch=sm_35 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) + KOKKOS_CXXFLAGS += -arch=sm_37 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) + KOKKOS_CXXFLAGS += -arch=sm_50 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) + KOKKOS_CXXFLAGS += -arch=sm_52 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) + KOKKOS_CXXFLAGS += -arch=sm_53 +endif +endif + +KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h) +ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h) +KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc 
-l)) +else +KOKKOS_INTERNAL_NEW_CONFIG := 1 +endif + +ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) + tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h) +endif + +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp) + +KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp) +KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp) + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp) + KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp) + KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 + KOKKOS_LIBS += -lcudart -lcuda +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + KOKKOS_LIBS += -lpthread + KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp) + KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp) + KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp) + ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG) + else + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG) + endif + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG) +endif + + +# Setting up dependencies + +KokkosCore_config.h: + +KOKKOS_CPP_DEPENDS := KokkosCore_config.h $(KOKKOS_HEADERS) + +KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o) +KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) + +include $(KOKKOS_PATH)/Makefile.targets + +kokkos-clean: + rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a + +libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) + ar cr libkokkos.a $(KOKKOS_OBJ_LINK) + +KOKKOS_LINK_DEPENDS=libkokkos.a diff --git 
a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets new file mode 100755 index 0000000000000000000000000000000000000000..86708ac80176c18d6cd08547c2715a600edcc997 --- /dev/null +++ b/lib/kokkos/Makefile.targets @@ -0,0 +1,50 @@ +Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp +Kokkos_AllocationTracker.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp +Kokkos_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp +Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp +Kokkos_Error.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp +Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp +Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp +Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp +Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) 
$(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp +Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp +Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp +Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp +KokkosExp_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) +Kokkos_Cuda_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp +Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp +Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) +Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp +Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp +Kokkos_Threads_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) +Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp +endif + diff --git a/lib/kokkos/README b/lib/kokkos/README new file mode 100755 index 0000000000000000000000000000000000000000..f979495bfd64ddf3ed12f083e5625920bd372f9c --- /dev/null +++ b/lib/kokkos/README @@ -0,0 +1,97 @@ +Kokkos implements a programming model in C++ for writing performance portable +applications targeting all major HPC platforms. For that purpose it provides +abstractions for both parallel execution of code and data management. +Kokkos is designed to target complex node architectures with N-level memory +hierarchies and multiple types of execution resources. It currently can use +OpenMP, Pthreads and CUDA as backend programming models. + +The core developers of Kokkos are Carter Edwards and Christian Trott +at the Computer Science Research Institute of the Sandia National +Laboratories. + +The KokkosP interface and associated tools are developed by the Application +Performance Team and Kokkos core developers at Sandia National Laboratories. 
+ +To learn more about Kokkos consider watching one of our presentations: +GTC 2015: + http://on-demand.gputechconf.com/gtc/2015/video/S5166.html + http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf + +A programming guide can be found under doc/Kokkos_PG.pdf. This is an initial version +and feedback is greatly appreciated. + +For questions please send an email to +kokkos-users@software.sandia.gov + +For non-public questions send an email to +hcedwar(at)sandia.gov and crtrott(at)sandia.gov + +============================================================================ +====Requirements============================================================ +============================================================================ + +Primary tested compilers are: + GCC 4.7.2 + GCC 5.1.0 + Intel 14.0.1 + Intel 15.0.1 + Clang 3.7.0 + +Secondary tested compilers are: + CUDA 6.5 + CUDA 7.0 + +Primary tested compiler are passing in release mode +with warnings as errors. We are using the following set +of flags: +GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits + -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized +Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized +Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized + + +============================================================================ +====Getting started========================================================= +============================================================================ + +In the 'example/tutorial' directory you will find step by step tutorial +examples which explain many of the features of Kokkos. They work with +simple Makefiles. To build with g++ and OpenMP simply type 'make openmp' +in the 'example/tutorial' directory. This will build all examples in the +subfolders. 
+ +============================================================================ +====Running Unit Tests====================================================== +============================================================================ + +To run the unit tests create a build directory and run the following commands + +KOKKOS_PATH/generate_makefile.bash +make build-test +make test + +Run KOKKOS_PATH/generate_makefile.bash --help for more detailed options such as +changing the device type for which to build. + +============================================================================ +====Install the library===================================================== +============================================================================ + +To install Kokkos as a library create a build directory and run the following + +KOKKOS_PATH/generate_makefile.bash --prefix=INSTALL_PATH +make lib +make install + +KOKKOS_PATH/generate_makefile.bash --help for more detailed options such as +changing the device type for which to build. + +============================================================================ +====CMakeFiles============================================================== +============================================================================ + +The CMake files contained in this repository require Tribits and are used +for integration with Trilinos. They do not currently support a standalone +CMake build. 
+ + diff --git a/lib/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp b/lib/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp new file mode 100755 index 0000000000000000000000000000000000000000..11763c2f10d317ab01940f1df8a32d3923a98fbf --- /dev/null +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -0,0 +1,1691 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_RANDOM_HPP +#define KOKKOS_RANDOM_HPP + +#include <Kokkos_Core.hpp> +#include <cstdio> +#include <cstdlib> +#include <cmath> + +/// \file Kokkos_Random.hpp +/// \brief Pseudorandom number generators +/// +/// These generators are based on Vigna, Sebastiano (2014). "An +/// experimental exploration of Marsaglia's xorshift generators, +/// scrambled." 
See: http://arxiv.org/abs/1402.6246 + +namespace Kokkos { + + /*Template functions to get equidistributed random numbers from a generator for a specific Scalar type + + template<class Generator,Scalar> + struct rand{ + + //Max value returned by draw(Generator& gen) + KOKKOS_INLINE_FUNCTION + static Scalar max(); + + //Returns a value between zero and max() + KOKKOS_INLINE_FUNCTION + static Scalar draw(Generator& gen); + + //Returns a value between zero and range() + //Note: for floating point values range can be larger than max() + KOKKOS_INLINE_FUNCTION + static Scalar draw(Generator& gen, const Scalar& range){} + + //Return value between start and end + KOKKOS_INLINE_FUNCTION + static Scalar draw(Generator& gen, const Scalar& start, const Scalar& end); + }; + + The Random number generators themselves have two components a state-pool and the actual generator + A state-pool manages a number of generators, so that each active thread is able to grep its own. + This allows the generation of random numbers which are independent between threads. Note that + in contrast to CuRand none of the functions of the pool (or the generator) are collectives, + i.e. all functions can be called inside conditionals. + + template<class Device> + class Pool { + public: + //The Kokkos device type + typedef Device device_type; + //The actual generator type + typedef Generator<Device> generator_type; + + //Default constructor: does not initialize a pool + Pool(); + + //Initializing constructor: calls init(seed,Device_Specific_Number); + Pool(unsigned int seed); + + //Intialize Pool with seed as a starting seed with a pool_size of num_states + //The Random_XorShift64 generator is used in serial to initialize all states, + //thus the intialization process is platform independent and deterministic. + void init(unsigned int seed, int num_states); + + //Get a generator. This will lock one of the states, guaranteeing that each thread + //will have its private generator. 
Note: on Cuda getting a state involves atomics, + //and is thus not deterministic! + generator_type get_state(); + + //Give a state back to the pool. This unlocks the state, and writes the modified + //state of the generator back to the pool. + void free_state(generator_type gen); + + } + + template<class Device> + class Generator { + public: + //The Kokkos device type + typedef DeviceType device_type; + + //Max return values of respective [X]rand[S]() functions + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast<int>(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)}; + + + //Init with a state and the idx with respect to pool. Note: in serial the + //Generator can be used by just giving it the necessary state arguments + KOKKOS_INLINE_FUNCTION + Generator (STATE_ARGUMENTS, int state_idx = 0); + + //Draw a equidistributed uint32_t in the range (0,MAX_URAND] + KOKKOS_INLINE_FUNCTION + uint32_t urand(); + + //Draw a equidistributed uint64_t in the range (0,MAX_URAND64] + KOKKOS_INLINE_FUNCTION + uint64_t urand64(); + + //Draw a equidistributed uint32_t in the range (0,range] + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range); + + //Draw a equidistributed uint32_t in the range (start,end] + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ); + + //Draw a equidistributed uint64_t in the range (0,range] + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range); + + //Draw a equidistributed uint64_t in the range (start,end] + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ); + + //Draw a equidistributed int in the range (0,MAX_RAND] + KOKKOS_INLINE_FUNCTION + int rand(); + + //Draw a equidistributed int in the range (0,range] + KOKKOS_INLINE_FUNCTION + int rand(const int& range); + + //Draw a equidistributed int in the range (start,end] + KOKKOS_INLINE_FUNCTION + int rand(const int& 
start, const int& end ); + + //Draw a equidistributed int64_t in the range (0,MAX_RAND64] + KOKKOS_INLINE_FUNCTION + int64_t rand64(); + + //Draw a equidistributed int64_t in the range (0,range] + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range); + + //Draw a equidistributed int64_t in the range (start,end] + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ); + + //Draw a equidistributed float in the range (0,1.0] + KOKKOS_INLINE_FUNCTION + float frand(); + + //Draw a equidistributed float in the range (0,range] + KOKKOS_INLINE_FUNCTION + float frand(const float& range); + + //Draw a equidistributed float in the range (start,end] + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end ); + + //Draw a equidistributed double in the range (0,1.0] + KOKKOS_INLINE_FUNCTION + double drand(); + + //Draw a equidistributed double in the range (0,range] + KOKKOS_INLINE_FUNCTION + double drand(const double& range); + + //Draw a equidistributed double in the range (start,end] + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ); + + //Draw a standard normal distributed double + KOKKOS_INLINE_FUNCTION + double normal() ; + + //Draw a normal distributed double with given mean and standard deviation + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0); + } + + //Additional Functions: + + //Fills view with random numbers in the range (0,range] + template<class ViewType, class PoolType> + void fill_random(ViewType view, PoolType pool, ViewType::value_type range); + + //Fills view with random numbers in the range (start,end] + template<class ViewType, class PoolType> + void fill_random(ViewType view, PoolType pool, + ViewType::value_type start, ViewType::value_type end); + +*/ + + template<class Generator, class Scalar> + struct rand; + + + template<class Generator> + struct rand<Generator,char> { + + KOKKOS_INLINE_FUNCTION + static short max(){return 
127;} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen) + {return short((gen.rand()&0xff+256)%256);} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const char& range) + {return char(gen.rand(range));} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const char& start, const char& end) + {return char(gen.rand(start,end));} + + }; + + template<class Generator> + struct rand<Generator,short> { + KOKKOS_INLINE_FUNCTION + static short max(){return 32767;} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen) + {return short((gen.rand()&0xffff+65536)%32768);} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const short& range) + {return short(gen.rand(range));} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const short& start, const short& end) + {return short(gen.rand(start,end));} + + }; + + template<class Generator> + struct rand<Generator,int> { + KOKKOS_INLINE_FUNCTION + static int max(){return Generator::MAX_RAND;} + KOKKOS_INLINE_FUNCTION + static int draw(Generator& gen) + {return gen.rand();} + KOKKOS_INLINE_FUNCTION + static int draw(Generator& gen, const int& range) + {return gen.rand(range);} + KOKKOS_INLINE_FUNCTION + static int draw(Generator& gen, const int& start, const int& end) + {return gen.rand(start,end);} + + }; + + template<class Generator> + struct rand<Generator,unsigned int> { + KOKKOS_INLINE_FUNCTION + static unsigned int max () { + return Generator::MAX_URAND; + } + KOKKOS_INLINE_FUNCTION + static unsigned int draw (Generator& gen) { + return gen.urand (); + } + KOKKOS_INLINE_FUNCTION + static unsigned int draw(Generator& gen, const unsigned int& range) { + return gen.urand (range); + } + KOKKOS_INLINE_FUNCTION + static unsigned int + draw (Generator& gen, const unsigned int& start, const unsigned int& end) { + return gen.urand (start, end); + } + }; + + template<class Generator> + struct rand<Generator,long> { + KOKKOS_INLINE_FUNCTION + static long max () { + // FIXME (mfh 
26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof (long) == 4 ? + static_cast<long> (Generator::MAX_RAND) : + static_cast<long> (Generator::MAX_RAND64); + } + KOKKOS_INLINE_FUNCTION + static long draw (Generator& gen) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof (long) == 4 ? + static_cast<long> (gen.rand ()) : + static_cast<long> (gen.rand64 ()); + } + KOKKOS_INLINE_FUNCTION + static long draw (Generator& gen, const long& range) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof (long) == 4 ? + static_cast<long> (gen.rand (static_cast<int> (range))) : + static_cast<long> (gen.rand64 (range)); + } + KOKKOS_INLINE_FUNCTION + static long draw (Generator& gen, const long& start, const long& end) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof (long) == 4 ? + static_cast<long> (gen.rand (static_cast<int> (start), + static_cast<int> (end))) : + static_cast<long> (gen.rand64 (start, end)); + } + }; + + template<class Generator> + struct rand<Generator,unsigned long> { + KOKKOS_INLINE_FUNCTION + static unsigned long max () { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof (unsigned long) == 4 ? + static_cast<unsigned long> (Generator::MAX_URAND) : + static_cast<unsigned long> (Generator::MAX_URAND64); + } + KOKKOS_INLINE_FUNCTION + static unsigned long draw (Generator& gen) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof (unsigned long) == 4 ? 
+ static_cast<unsigned long> (gen.urand ()) : + static_cast<unsigned long> (gen.urand64 ()); + } + KOKKOS_INLINE_FUNCTION + static unsigned long draw(Generator& gen, const unsigned long& range) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof (unsigned long) == 4 ? + static_cast<unsigned long> (gen.urand (static_cast<unsigned int> (range))) : + static_cast<unsigned long> (gen.urand64 (range)); + } + KOKKOS_INLINE_FUNCTION + static unsigned long + draw (Generator& gen, const unsigned long& start, const unsigned long& end) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof (unsigned long) == 4 ? + static_cast<unsigned long> (gen.urand (static_cast<unsigned int> (start), + static_cast<unsigned int> (end))) : + static_cast<unsigned long> (gen.urand64 (start, end)); + } + }; + + // NOTE (mfh 26 oct 2014) This is a partial specialization for long + // long, a C99 / C++11 signed type which is guaranteed to be at + // least 64 bits. Do NOT write a partial specialization for + // int64_t!!! This is just a typedef! It could be either long or + // long long. We don't know which a priori, and I've seen both. + // The types long and long long are guaranteed to differ, so it's + // always safe to specialize for both. + template<class Generator> + struct rand<Generator, long long> { + KOKKOS_INLINE_FUNCTION + static long long max () { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. + return Generator::MAX_RAND64; + } + KOKKOS_INLINE_FUNCTION + static long long draw (Generator& gen) { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. + return gen.rand64 (); + } + KOKKOS_INLINE_FUNCTION + static long long draw (Generator& gen, const long long& range) { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. 
+ return gen.rand64 (range); + } + KOKKOS_INLINE_FUNCTION + static long long draw (Generator& gen, const long long& start, const long long& end) { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. + return gen.rand64 (start, end); + } + }; + + // NOTE (mfh 26 oct 2014) This is a partial specialization for + // unsigned long long, a C99 / C++11 unsigned type which is + // guaranteed to be at least 64 bits. Do NOT write a partial + // specialization for uint64_t!!! This is just a typedef! It could + // be either unsigned long or unsigned long long. We don't know + // which a priori, and I've seen both. The types unsigned long and + // unsigned long long are guaranteed to differ, so it's always safe + // to specialize for both. + template<class Generator> + struct rand<Generator,unsigned long long> { + KOKKOS_INLINE_FUNCTION + static unsigned long long max () { + // FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64 bits. + return Generator::MAX_URAND64; + } + KOKKOS_INLINE_FUNCTION + static unsigned long long draw (Generator& gen) { + // FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64 bits. + return gen.urand64 (); + } + KOKKOS_INLINE_FUNCTION + static unsigned long long draw (Generator& gen, const unsigned long long& range) { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. + return gen.urand64 (range); + } + KOKKOS_INLINE_FUNCTION + static unsigned long long + draw (Generator& gen, const unsigned long long& start, const unsigned long long& end) { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. 
+ return gen.urand64 (start, end); + } + }; + + template<class Generator> + struct rand<Generator,float> { + KOKKOS_INLINE_FUNCTION + static float max(){return 1.0f;} + KOKKOS_INLINE_FUNCTION + static float draw(Generator& gen) + {return gen.frand();} + KOKKOS_INLINE_FUNCTION + static float draw(Generator& gen, const float& range) + {return gen.frand(range);} + KOKKOS_INLINE_FUNCTION + static float draw(Generator& gen, const float& start, const float& end) + {return gen.frand(start,end);} + + }; + + template<class Generator> + struct rand<Generator,double> { + KOKKOS_INLINE_FUNCTION + static double max(){return 1.0;} + KOKKOS_INLINE_FUNCTION + static double draw(Generator& gen) + {return gen.drand();} + KOKKOS_INLINE_FUNCTION + static double draw(Generator& gen, const double& range) + {return gen.drand(range);} + KOKKOS_INLINE_FUNCTION + static double draw(Generator& gen, const double& start, const double& end) + {return gen.drand(start,end);} + + }; + + template<class DeviceType> + class Random_XorShift64_Pool; + + template<class DeviceType> + class Random_XorShift64 { + private: + uint64_t state_; + const int state_idx_; + friend class Random_XorShift64_Pool<DeviceType>; + public: + + typedef DeviceType device_type; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast<int>(0xffffffff/2)}; + enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffLL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift64 (uint64_t state, int state_idx = 0) + : state_(state),state_idx_(state_idx){} + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + state_ ^= state_ >> 12; + state_ ^= state_ << 25; + state_ ^= state_ >> 27; + + uint64_t tmp = state_ * 2685821657736338717ULL; + tmp = tmp>>16; + return static_cast<uint32_t>(tmp&MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + state_ ^= state_ >> 12; + state_ ^= state_ << 25; + state_ ^= state_ >> 27; + return (state_ * 2685821657736338717ULL) - 1; + } + + 
KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND/range)*range; + uint32_t tmp = urand(); + while(tmp>=max_val) + tmp = urand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ) { + return urand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64/range)*range; + uint64_t tmp = urand64(); + while(tmp>=max_val) + tmp = urand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ) { + return urand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { + return static_cast<int>(urand()/2); + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND/range)*range; + int tmp = rand(); + while(tmp>=max_val) + tmp = rand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ) { + return rand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { + return static_cast<int64_t>(urand64()/2); + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64/range)*range; + int64_t tmp = rand64(); + while(tmp>=max_val) + tmp = rand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ) { + return rand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { + return 1.0f * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end ) { + return frand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { + return 1.0 * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * 
urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ) { + return drand(end-start)+start; + } + + //Marsaglia polar method for drawing a standard normal distributed random number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while(S>=1.0) { + U = drand(); + const double V = drand(); + S = U*U+V*V; + } + return U*sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + + }; + + template<class DeviceType = Kokkos::DefaultExecutionSpace> + class Random_XorShift64_Pool { + private: + typedef View<int*,DeviceType> lock_type; + typedef View<uint64_t*,DeviceType> state_data_type; + lock_type locks_; + state_data_type state_; + int num_states_; + + public: + typedef Random_XorShift64<DeviceType> generator_type; + typedef DeviceType device_type; + + Random_XorShift64_Pool() { + num_states_ = 0; + } + Random_XorShift64_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,DeviceType::max_hardware_threads()); + } + + Random_XorShift64_Pool(const Random_XorShift64_Pool& src): + locks_(src.locks_), + state_(src.state_), + num_states_(src.num_states_) + {} + + Random_XorShift64_Pool operator = (const Random_XorShift64_Pool& src) { + locks_ = src.locks_; + state_ = src.state_; + num_states_ = src.num_states_; + return *this; + } + + void init(uint64_t seed, int num_states) { + num_states_ = num_states; + + locks_ = lock_type("Kokkos::Random_XorShift64::locks",num_states_); + state_ = state_data_type("Kokkos::Random_XorShift64::state",num_states_); + + typename state_data_type::HostMirror h_state = create_mirror_view(state_); + typename lock_type::HostMirror h_lock = create_mirror_view(locks_); + + // Execute on the HostMirror's default execution space. 
+ Random_XorShift64<typename state_data_type::HostMirror::execution_space> gen(seed,0); + for(int i = 0; i < 17; i++) + gen.rand(); + for(int i = 0; i < num_states_; i++) { + int n1 = gen.rand(); + int n2 = gen.rand(); + int n3 = gen.rand(); + int n4 = gen.rand(); + h_state(i) = (((static_cast<uint64_t>(n1)) & 0xffff)<<00) | + (((static_cast<uint64_t>(n2)) & 0xffff)<<16) | + (((static_cast<uint64_t>(n3)) & 0xffff)<<32) | + (((static_cast<uint64_t>(n4)) & 0xffff)<<48); + h_lock(i) = 0; + } + deep_copy(state_,h_state); + deep_copy(locks_,h_lock); + } + + KOKKOS_INLINE_FUNCTION + Random_XorShift64<DeviceType> get_state() const { + const int i = DeviceType::hardware_thread_id();; + return Random_XorShift64<DeviceType>(state_(i),i); + } + + KOKKOS_INLINE_FUNCTION + void free_state(const Random_XorShift64<DeviceType>& state) const { + state_(state.state_idx_) = state.state_; + } + }; + + + template<class DeviceType> + class Random_XorShift1024_Pool; + + template<class DeviceType> + class Random_XorShift1024 { + private: + int p_; + const int state_idx_; + uint64_t state_[16]; + friend class Random_XorShift1024_Pool<DeviceType>; + public: + + typedef DeviceType device_type; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast<int>(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0): + p_(p),state_idx_(state_idx){ + for(int i=0 ; i<16; i++) + state_[i] = state[i]; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + uint64_t state_0 = state_[ p_ ]; + uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL; + tmp = tmp>>16; + return static_cast<uint32_t>(tmp&MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + 
uint64_t state_0 = state_[ p_ ]; + uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND/range)*range; + uint32_t tmp = urand(); + while(tmp>=max_val) + tmp = urand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ) { + return urand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64/range)*range; + uint64_t tmp = urand64(); + while(tmp>=max_val) + tmp = urand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ) { + return urand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { + return static_cast<int>(urand()/2); + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND/range)*range; + int tmp = rand(); + while(tmp>=max_val) + tmp = rand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ) { + return rand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { + return static_cast<int64_t>(urand64()/2); + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64/range)*range; + int64_t tmp = rand64(); + while(tmp>=max_val) + tmp = rand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ) { + return rand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { + return 1.0f * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const 
float& start, const float& end ) { + return frand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { + return 1.0 * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ) { + return frand(end-start)+start; + } + + //Marsaglia polar method for drawing a standard normal distributed random number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while(S>=1.0) { + U = drand(); + const double V = drand(); + S = U*U+V*V; + } + return U*sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + }; + + + template<class DeviceType = Kokkos::DefaultExecutionSpace> + class Random_XorShift1024_Pool { + private: + typedef View<int*,DeviceType> int_view_type; + typedef View<uint64_t*[16],DeviceType> state_data_type; + + int_view_type locks_; + state_data_type state_; + int_view_type p_; + int num_states_; + + public: + typedef Random_XorShift1024<DeviceType> generator_type; + + typedef DeviceType device_type; + + Random_XorShift1024_Pool() { + num_states_ = 0; + } + + inline + Random_XorShift1024_Pool(uint64_t seed){ + num_states_ = 0; + init(seed,DeviceType::max_hardware_threads()); + } + + Random_XorShift1024_Pool(const Random_XorShift1024_Pool& src): + locks_(src.locks_), + state_(src.state_), + p_(src.p_), + num_states_(src.num_states_) + {} + + Random_XorShift1024_Pool operator = (const Random_XorShift1024_Pool& src) { + locks_ = src.locks_; + state_ = src.state_; + p_ = src.p_; + num_states_ = src.num_states_; + return *this; + } + + inline + void init(uint64_t seed, int num_states) { + num_states_ = num_states; + + locks_ = int_view_type("Kokkos::Random_XorShift1024::locks",num_states_); + state_ = state_data_type("Kokkos::Random_XorShift1024::state",num_states_); + p_ = 
int_view_type("Kokkos::Random_XorShift1024::p",num_states_); + + typename state_data_type::HostMirror h_state = create_mirror_view(state_); + typename int_view_type::HostMirror h_lock = create_mirror_view(locks_); + typename int_view_type::HostMirror h_p = create_mirror_view(p_); + + // Execute on the HostMirror's default execution space. + Random_XorShift64<typename state_data_type::HostMirror::execution_space> gen(seed,0); + for(int i = 0; i < 17; i++) + gen.rand(); + for(int i = 0; i < num_states_; i++) { + for(int j = 0; j < 16 ; j++) { + int n1 = gen.rand(); + int n2 = gen.rand(); + int n3 = gen.rand(); + int n4 = gen.rand(); + h_state(i,j) = (((static_cast<uint64_t>(n1)) & 0xffff)<<00) | + (((static_cast<uint64_t>(n2)) & 0xffff)<<16) | + (((static_cast<uint64_t>(n3)) & 0xffff)<<32) | + (((static_cast<uint64_t>(n4)) & 0xffff)<<48); + } + h_p(i) = 0; + h_lock(i) = 0; + } + deep_copy(state_,h_state); + deep_copy(locks_,h_lock); + } + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024<DeviceType> get_state() const { + const int i = DeviceType::hardware_thread_id(); + return Random_XorShift1024<DeviceType>(&state_(i,0),p_(i),i); + }; + + KOKKOS_INLINE_FUNCTION + void free_state(const Random_XorShift1024<DeviceType>& state) const { + for(int i = 0; i<16; i++) + state_(state.state_idx_,i) = state.state_[i]; + p_(state.state_idx_) = state.p_; + } + }; + +#if defined(KOKKOS_HAVE_CUDA) && defined(__CUDACC__) + + template<> + class Random_XorShift1024<Kokkos::Cuda> { + private: + int p_; + const int state_idx_; + uint64_t* state_; + friend class Random_XorShift1024_Pool<Kokkos::Cuda>; + public: + + typedef Kokkos::Cuda device_type; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast<int>(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0): + p_(p),state_idx_(state_idx),state_(state){ + 
} + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + uint64_t state_0 = state_[ p_ ]; + uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL; + tmp = tmp>>16; + return static_cast<uint32_t>(tmp&MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + uint64_t state_0 = state_[ p_ ]; + uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND/range)*range; + uint32_t tmp = urand(); + while(tmp>=max_val) + urand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ) { + return urand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64/range)*range; + uint64_t tmp = urand64(); + while(tmp>=max_val) + urand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ) { + return urand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { + return static_cast<int>(urand()/2); + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND/range)*range; + int tmp = rand(); + while(tmp>=max_val) + rand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ) { + return rand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { + return static_cast<int64_t>(urand64()/2); + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64/range)*range; + int64_t tmp = rand64(); + while(tmp>=max_val) + rand64(); 
+ return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ) { + return rand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { + return 1.0f * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end ) { + return frand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { + return 1.0 * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ) { + return frand(end-start)+start; + } + + //Marsaglia polar method for drawing a standard normal distributed random number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while(S>=1.0) { + U = drand(); + const double V = drand(); + S = U*U+V*V; + } + return U*sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + }; + +template<> +inline +Random_XorShift64_Pool<Kokkos::Cuda>::Random_XorShift64_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift64<Kokkos::Cuda> Random_XorShift64_Pool<Kokkos::Cuda>::get_state() const { +#ifdef __CUDA_ARCH__ + const int i_offset = (threadIdx.x*blockDim.y + threadIdx.y)*blockDim.z+threadIdx.z; + int i = ((blockIdx.x*gridDim.y+blockIdx.y)*gridDim.z + blockIdx.z) * + blockDim.x*blockDim.y*blockDim.z + i_offset; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim.x*blockDim.y*blockDim.z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift64<Kokkos::Cuda>(state_(i),i); +#else + return Random_XorShift64<Kokkos::Cuda>(state_(0),0); +#endif +} + +template<> 
+KOKKOS_INLINE_FUNCTION +void Random_XorShift64_Pool<Kokkos::Cuda>::free_state(const Random_XorShift64<Kokkos::Cuda> &state) const { +#ifdef __CUDA_ARCH__ + state_(state.state_idx_) = state.state_; + locks_(state.state_idx_) = 0; + return; +#endif +} + + +template<> +inline +Random_XorShift1024_Pool<Kokkos::Cuda>::Random_XorShift1024_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_state() const { +#ifdef __CUDA_ARCH__ + const int i_offset = (threadIdx.x*blockDim.y + threadIdx.y)*blockDim.z+threadIdx.z; + int i = ((blockIdx.x*gridDim.y+blockIdx.y)*gridDim.z + blockIdx.z) * + blockDim.x*blockDim.y*blockDim.z + i_offset; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim.x*blockDim.y*blockDim.z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift1024<Kokkos::Cuda>(&state_(i,0), p_(i), i); +#else + return Random_XorShift1024<Kokkos::Cuda>(&state_(0,0), p_(0), 0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift1024<Kokkos::Cuda> &state) const { +#ifdef __CUDA_ARCH__ + for(int i=0; i<16; i++) + state_(state.state_idx_,i) = state.state_[i]; + locks_(state.state_idx_) = 0; + return; +#endif +} + + +#endif + + + +template<class ViewType, class RandomPool, int loops, int rank> +struct fill_random_functor_range; +template<class ViewType, class RandomPool, int loops, int rank> +struct fill_random_functor_begin_end; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_range<ViewType,RandomPool,loops,1>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_range(ViewType a_, RandomPool 
rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) + a(idx) = Rand::draw(gen,range); + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_range<ViewType,RandomPool,loops,2>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + a(idx,k) = Rand::draw(gen,range); + } + } + rand_pool.free_state(gen); + } +}; + + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_range<ViewType,RandomPool,loops,3>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int 
j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + a(idx,k,l) = Rand::draw(gen,range); + } + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_range<ViewType,RandomPool,loops,4>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + for(unsigned int m=0;m<a.dimension_3();m++) + a(idx,k,l,m) = Rand::draw(gen,range); + } + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_range<ViewType,RandomPool,loops,5>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + 
if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + for(unsigned int m=0;m<a.dimension_3();m++) + for(unsigned int n=0;n<a.dimension_4();n++) + a(idx,k,l,m,n) = Rand::draw(gen,range); + } + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_range<ViewType,RandomPool,loops,6>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + for(unsigned int m=0;m<a.dimension_3();m++) + for(unsigned int n=0;n<a.dimension_4();n++) + for(unsigned int o=0;o<a.dimension_5();o++) + a(idx,k,l,m,n,o) = Rand::draw(gen,range); + } + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_range<ViewType,RandomPool,loops,7>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename 
RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + for(unsigned int m=0;m<a.dimension_3();m++) + for(unsigned int n=0;n<a.dimension_4();n++) + for(unsigned int o=0;o<a.dimension_5();o++) + for(unsigned int p=0;p<a.dimension_6();p++) + a(idx,k,l,m,n,o,p) = Rand::draw(gen,range); + } + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_range<ViewType,RandomPool,loops,8>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + for(unsigned int m=0;m<a.dimension_3();m++) + for(unsigned int n=0;n<a.dimension_4();n++) + for(unsigned int o=0;o<a.dimension_5();o++) + for(unsigned int p=0;p<a.dimension_6();p++) + for(unsigned int q=0;q<a.dimension_7();q++) + a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,range); + } + } + rand_pool.free_state(gen); + } +}; +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand<typename 
RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) + a(idx) = Rand::draw(gen,begin,end); + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + a(idx,k) = Rand::draw(gen,begin,end); + } + } + rand_pool.free_state(gen); + } +}; + + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool 
rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + a(idx,k,l) = Rand::draw(gen,begin,end); + } + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + for(unsigned int m=0;m<a.dimension_3();m++) + a(idx,k,l,m) = Rand::draw(gen,begin,end); + } + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> 
Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()){ + for(unsigned int l=0;l<a.dimension_1();l++) + for(unsigned int m=0;m<a.dimension_2();m++) + for(unsigned int n=0;n<a.dimension_3();n++) + for(unsigned int o=0;o<a.dimension_4();o++) + a(idx,l,m,n,o) = Rand::draw(gen,begin,end); + } + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + for(unsigned int m=0;m<a.dimension_3();m++) + for(unsigned int n=0;n<a.dimension_4();n++) + for(unsigned int o=0;o<a.dimension_5();o++) + a(idx,k,l,m,n,o) = Rand::draw(gen,begin,end); + } + } + rand_pool.free_state(gen); + } +}; + + +template<class ViewType, class RandomPool, int loops> +struct 
fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + for(unsigned int m=0;m<a.dimension_3();m++) + for(unsigned int n=0;n<a.dimension_4();n++) + for(unsigned int o=0;o<a.dimension_5();o++) + for(unsigned int p=0;p<a.dimension_6();p++) + a(idx,k,l,m,n,o,p) = Rand::draw(gen,begin,end); + } + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool, int loops> +struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{ + typedef typename ViewType::execution_space execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j<loops;j++) { + const uint64_t idx = i*loops+j; + if(idx<a.dimension_0()) { + for(unsigned int 
k=0;k<a.dimension_1();k++) + for(unsigned int l=0;l<a.dimension_2();l++) + for(unsigned int m=0;m<a.dimension_3();m++) + for(unsigned int n=0;n<a.dimension_4();n++) + for(unsigned int o=0;o<a.dimension_5();o++) + for(unsigned int p=0;p<a.dimension_6();p++) + for(unsigned int q=0;q<a.dimension_7();q++) + a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,begin,end); + } + } + rand_pool.free_state(gen); + } +}; + +template<class ViewType, class RandomPool> +void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) { + int64_t LDA = a.dimension_0(); + if(LDA>0) + parallel_for((LDA+127)/128,fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank>(a,g,range)); +} + +template<class ViewType, class RandomPool> +void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) { + int64_t LDA = a.dimension_0(); + if(LDA>0) + parallel_for((LDA+127)/128,fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank>(a,g,begin,end)); +} +} + +#endif diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp new file mode 100755 index 0000000000000000000000000000000000000000..8d97472aa9f0838d6d0a740a7717f21015d35639 --- /dev/null +++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -0,0 +1,496 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#ifndef KOKKOS_SORT_HPP_ +#define KOKKOS_SORT_HPP_ + +#include <Kokkos_Core.hpp> + +#include <algorithm> + +namespace Kokkos { + + namespace SortImpl { + + template<class ValuesViewType, int Rank=ValuesViewType::Rank> + struct CopyOp; + + template<class ValuesViewType> + struct CopyOp<ValuesViewType,1> { + template<class DstType, class SrcType> + KOKKOS_INLINE_FUNCTION + static void copy(DstType& dst, size_t i_dst, + SrcType& src, size_t i_src ) { + dst(i_dst) = src(i_src); + } + }; + + template<class ValuesViewType> + struct CopyOp<ValuesViewType,2> { + template<class DstType, class SrcType> + KOKKOS_INLINE_FUNCTION + static void copy(DstType& dst, size_t i_dst, + SrcType& src, size_t i_src ) { + for(int j = 0;j< (int) dst.dimension_1(); j++) + dst(i_dst,j) = src(i_src,j); + } + }; + + template<class ValuesViewType> + struct CopyOp<ValuesViewType,3> { + template<class DstType, class SrcType> + KOKKOS_INLINE_FUNCTION + static void copy(DstType& dst, size_t i_dst, + SrcType& src, size_t i_src ) { + for(int j = 0; j<dst.dimension_1(); j++) + for(int k = 0; k<dst.dimension_2(); k++) + dst(i_dst,j,k) = src(i_src,j,k); + } + }; + } + +template<class KeyViewType, class BinSortOp, class ExecutionSpace = typename KeyViewType::execution_space, + class SizeType = typename KeyViewType::memory_space::size_type> +class BinSort { + + +public: + template<class ValuesViewType, class PermuteViewType, class CopyOp> + struct bin_sort_sort_functor { + typedef ExecutionSpace execution_space; + typedef typename ValuesViewType::non_const_type values_view_type; + typedef typename ValuesViewType::const_type const_values_view_type; + Kokkos::View<typename values_view_type::const_data_type,typename values_view_type::array_layout, + typename values_view_type::memory_space,Kokkos::MemoryTraits<Kokkos::RandomAccess> > values; + values_view_type sorted_values; 
+ typename PermuteViewType::const_type sort_order; + bin_sort_sort_functor(const_values_view_type values_, values_view_type sorted_values_, PermuteViewType sort_order_): + values(values_),sorted_values(sorted_values_),sort_order(sort_order_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + //printf("Sort: %i %i\n",i,sort_order(i)); + CopyOp::copy(sorted_values,i,values,sort_order(i)); + } + }; + + typedef ExecutionSpace execution_space; + typedef BinSortOp bin_op_type; + + struct bin_count_tag {}; + struct bin_offset_tag {}; + struct bin_binning_tag {}; + struct bin_sort_bins_tag {}; + +public: + typedef SizeType size_type; + typedef size_type value_type; + + typedef Kokkos::View<size_type*, execution_space> offset_type; + typedef Kokkos::View<const int*, execution_space> bin_count_type; + + + typedef Kokkos::View<typename KeyViewType::const_data_type, + typename KeyViewType::array_layout, + typename KeyViewType::memory_space> const_key_view_type; + typedef Kokkos::View<typename KeyViewType::const_data_type, + typename KeyViewType::array_layout, + typename KeyViewType::memory_space, + Kokkos::MemoryTraits<Kokkos::RandomAccess> > const_rnd_key_view_type; + + typedef typename KeyViewType::non_const_value_type non_const_key_scalar; + typedef typename KeyViewType::const_value_type const_key_scalar; + +private: + const_key_view_type keys; + const_rnd_key_view_type keys_rnd; + +public: + BinSortOp bin_op; + + offset_type bin_offsets; + + Kokkos::View<int*, ExecutionSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic; + bin_count_type bin_count_const; + + offset_type sort_order; + + bool sort_within_bins; + +public: + + // Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false) + BinSort(const_key_view_type keys_, BinSortOp bin_op_, + bool sort_within_bins_ = false) + :keys(keys_),keys_rnd(keys_), bin_op(bin_op_) { + + bin_count_atomic = Kokkos::View<int*, ExecutionSpace 
>("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
    // Const alias of the atomic counter view: both reference the same
    // allocation, so counts accumulated atomically are readable here.
    bin_count_const = bin_count_atomic;
    bin_offsets = offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
    sort_order = offset_type("PermutationVector",keys.dimension_0());
    sort_within_bins = sort_within_bins_;
  }

  // Create the permutation vector, the bin_offset array and the bin_count
  // array. Can be called again if keys changed.
  // Runs four phases over *this, dispatched by tag (see the tagged
  // operator() overloads below):
  //   1) bin_count_tag:     histogram keys into bins (atomic increments)
  //   2) bin_offset_tag:    exclusive scan of counts -> bin start offsets
  //   3) bin_binning_tag:   scatter key indices into sort_order
  //   4) bin_sort_bins_tag: optional per-bin ordering when sort_within_bins
  void create_permute_vector() {
    Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_count_tag> (0,keys.dimension_0()),*this);
    Kokkos::parallel_scan(Kokkos::RangePolicy<ExecutionSpace,bin_offset_tag> (0,bin_op.max_bins()) ,*this);

    // The counts are reused as scatter cursors in the binning phase, so
    // they must be zeroed between phases 2 and 3.
    Kokkos::deep_copy(bin_count_atomic,0);
    Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_binning_tag> (0,keys.dimension_0()),*this);

    if(sort_within_bins)
      Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
  }

  // Sort a view with respect to the first dimension using the permutation
  // array: allocates a scratch copy, gathers values(sort_order(i)) into
  // slot i via bin_sort_sort_functor, then deep-copies the result back.
  template<class ValuesViewType>
  void sort(ValuesViewType values) {
    // NOTE(review): passes all 8 extents regardless of rank — assumes the
    // View constructor ignores arguments beyond the view's rank; confirm.
    ValuesViewType sorted_values = ValuesViewType("Copy",
       values.dimension_0(),
       values.dimension_1(),
       values.dimension_2(),
       values.dimension_3(),
       values.dimension_4(),
       values.dimension_5(),
       values.dimension_6(),
       values.dimension_7());

    parallel_for(values.dimension_0(),
        bin_sort_sort_functor<ValuesViewType, offset_type,
                              SortImpl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));

    deep_copy(values,sorted_values);
  }

  // Get the permutation vector
  KOKKOS_INLINE_FUNCTION
  offset_type get_permute_vector() const { return sort_order;}

  // Get the start offsets for each bin
  KOKKOS_INLINE_FUNCTION
  offset_type get_bin_offsets() const { return bin_offsets;}

  // Get the count for each bin
  KOKKOS_INLINE_FUNCTION
  bin_count_type get_bin_count() const {return bin_count_const;}

public:
  KOKKOS_INLINE_FUNCTION
  void operator()
(const bin_count_tag& tag, const int& i) const {
    // Phase 1: histogram — count how many keys fall into each bin.
    bin_count_atomic(bin_op.bin(keys,i))++;
  }

  // Phase 2: exclusive prefix sum over the bin counts; on the final pass
  // the running offset is recorded as the start index of bin i.
  KOKKOS_INLINE_FUNCTION
  void operator() (const bin_offset_tag& tag, const int& i, value_type& offset, const bool& final) const {
    if(final) {
      bin_offsets(i) = offset;
    }
    offset+=bin_count_const(i);
  }

  // Phase 3: scatter — place key index i at the next free slot of its bin.
  // bin_count_atomic serves as the per-bin cursor and must have been
  // zeroed beforehand (done in create_permute_vector).
  KOKKOS_INLINE_FUNCTION
  void operator() (const bin_binning_tag& tag, const int& i) const {
    const int bin = bin_op.bin(keys,i);
    const int count = bin_count_atomic(bin)++;

    sort_order(bin_offsets(bin) + count) = i;
  }

  // Phase 4: order the permutation entries of bin i with a bubble-sort
  // pass driven by bin_op's comparator. Each outer iteration sinks one
  // more element into its final position, so upper_bound shrinks by one
  // per pass until a pass performs no swaps.
  KOKKOS_INLINE_FUNCTION
  void operator() (const bin_sort_bins_tag& tag, const int&i ) const {
    bool sorted = false;
    int upper_bound = bin_offsets(i)+bin_count_const(i);
    while(!sorted) {
      sorted = true;
      int old_idx = sort_order(bin_offsets(i));
      int new_idx;
      for(int k=bin_offsets(i)+1; k<upper_bound; k++) {
        new_idx = sort_order(k);

        if(!bin_op(keys_rnd,old_idx,new_idx)) {
          sort_order(k-1) = new_idx;
          sort_order(k) = old_idx;
          sorted = false;
        } else {
          old_idx = new_idx;
        }
      }
      upper_bound--;
    }
  }
};

namespace SortImpl {

// Default 1D binning operator: maps scalar keys in [min,max] onto equally
// sized bins (max_bins_ is the requested count plus one, so a key equal
// to max lands in the extra top bin instead of out of range).
template<class KeyViewType>
struct DefaultBinOp1D {
  const int max_bins_;
  const double mul_;
  typename KeyViewType::const_value_type range_;
  typename KeyViewType::const_value_type min_;

  //Construct BinOp with number of bins, minimum value and maximum value
  // NOTE(review): mul_ divides by (max-min); if all keys are equal this is
  // a division by zero — confirm callers guarantee max > min.
  DefaultBinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
                 typename KeyViewType::const_value_type max )
     :max_bins_(max_bins__+1),mul_(1.0*max_bins__/(max-min)),range_(max-min),min_(min) {}

  //Determine bin index from key value
  template<class ViewType>
  KOKKOS_INLINE_FUNCTION
  int bin(ViewType& keys, const int& i) const {
    return int(mul_*(keys(i)-min_));
  }

  //Return maximum bin index + 1
  KOKKOS_INLINE_FUNCTION
  int max_bins() const {
    return max_bins_;
  }

  //Compare two keys within a bin; if true new_val will be put before old_val
  template<class ViewType, typename iType1,
typename iType2> + KOKKOS_INLINE_FUNCTION + bool operator()(ViewType& keys, iType1& i1, iType2& i2) const { + return keys(i1)<keys(i2); + } +}; + +template<class KeyViewType> +struct DefaultBinOp3D { + int max_bins_[3]; + double mul_[3]; + typename KeyViewType::non_const_value_type range_[3]; + typename KeyViewType::non_const_value_type min_[3]; + + DefaultBinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], + typename KeyViewType::const_value_type max[] ) + { + max_bins_[0] = max_bins__[0]+1; + max_bins_[1] = max_bins__[1]+1; + max_bins_[2] = max_bins__[2]+1; + mul_[0] = 1.0*max_bins__[0]/(max[0]-min[0]); + mul_[1] = 1.0*max_bins__[1]/(max[1]-min[1]); + mul_[2] = 1.0*max_bins__[2]/(max[2]-min[2]); + range_[0] = max[0]-min[0]; + range_[1] = max[1]-min[1]; + range_[2] = max[2]-min[2]; + min_[0] = min[0]; + min_[1] = min[1]; + min_[2] = min[2]; + } + + template<class ViewType> + KOKKOS_INLINE_FUNCTION + int bin(ViewType& keys, const int& i) const { + return int( (((int(mul_[0]*(keys(i,0)-min_[0]))*max_bins_[1]) + + int(mul_[1]*(keys(i,1)-min_[1])))*max_bins_[2]) + + int(mul_[2]*(keys(i,2)-min_[2]))); + } + + KOKKOS_INLINE_FUNCTION + int max_bins() const { + return max_bins_[0]*max_bins_[1]*max_bins_[2]; + } + + template<class ViewType, typename iType1, typename iType2> + KOKKOS_INLINE_FUNCTION + bool operator()(ViewType& keys, iType1& i1 , iType2& i2) const { + if (keys(i1,0)>keys(i2,0)) return true; + else if (keys(i1,0)==keys(i2,0)) { + if (keys(i1,1)>keys(i2,1)) return true; + else if (keys(i1,1)==keys(i2,2)) { + if (keys(i1,2)>keys(i2,2)) return true; + } + } + return false; + } +}; + +template<typename Scalar> +struct min_max { + Scalar min; + Scalar max; + bool init; + + KOKKOS_INLINE_FUNCTION + min_max() { + min = 0; + max = 0; + init = 0; + } + + KOKKOS_INLINE_FUNCTION + min_max (const min_max& val) { + min = val.min; + max = val.max; + init = val.init; + } + + KOKKOS_INLINE_FUNCTION + min_max operator = (const min_max& val) { + min = 
val.min; + max = val.max; + init = val.init; + return *this; + } + + KOKKOS_INLINE_FUNCTION + void operator+= (const Scalar& val) { + if(init) { + min = min<val?min:val; + max = max>val?max:val; + } else { + min = val; + max = val; + init = 1; + } + } + + KOKKOS_INLINE_FUNCTION + void operator+= (const min_max& val) { + if(init && val.init) { + min = min<val.min?min:val.min; + max = max>val.max?max:val.max; + } else { + if(val.init) { + min = val.min; + max = val.max; + init = 1; + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator+= (volatile const Scalar& val) volatile { + if(init) { + min = min<val?min:val; + max = max>val?max:val; + } else { + min = val; + max = val; + init = 1; + } + } + + KOKKOS_INLINE_FUNCTION + void operator+= (volatile const min_max& val) volatile { + if(init && val.init) { + min = min<val.min?min:val.min; + max = max>val.max?max:val.max; + } else { + if(val.init) { + min = val.min; + max = val.max; + init = 1; + } + } + } +}; + + +template<class ViewType> +struct min_max_functor { + typedef typename ViewType::execution_space execution_space; + ViewType view; + typedef min_max<typename ViewType::non_const_value_type> value_type; + min_max_functor (const ViewType view_):view(view_) { + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t& i, value_type& val) const { + val += view(i); + } +}; + +template<class ViewType> +bool try_std_sort(ViewType view) { + bool possible = true; +#if ! 
defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
  size_t stride[8];
  view.stride(stride);
#else
  // Experimental-view interface exposes each stride individually.
  size_t stride[8] = { view.stride_0()
                     , view.stride_1()
                     , view.stride_2()
                     , view.stride_3()
                     , view.stride_4()
                     , view.stride_5()
                     , view.stride_6()
                     , view.stride_7()
                     };
#endif
  // std::sort is usable only for rank-1, unit-stride views living in
  // host-accessible memory.
  possible = possible && Impl::is_same<typename ViewType::memory_space, HostSpace>::value;
  possible = possible && (ViewType::Rank == 1);
  possible = possible && (stride[0] == 1);
  if(possible) {
   std::sort(view.ptr_on_device(),view.ptr_on_device()+view.dimension_0());
  }
  return possible;
}

}

// Sort a view in ascending order of its values.
// Unless always_use_kokkos_sort is set, falls back to std::sort for
// host-resident contiguous rank-1 views (try_std_sort above). Otherwise
// computes the key range with a min/max parallel reduction and runs
// BinSort with dimension_0()/2 bins, also sorting within each bin.
template<class ViewType>
void sort(ViewType view, bool always_use_kokkos_sort = false) {
  if(!always_use_kokkos_sort) {
    if(SortImpl::try_std_sort(view)) return;
  }

  typedef SortImpl::DefaultBinOp1D<ViewType> CompType;
  SortImpl::min_max<typename ViewType::non_const_value_type> val;
  parallel_reduce(view.dimension_0(),SortImpl::min_max_functor<ViewType>(view),val);
  BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,val.min,val.max),true);
  bin_sort.create_permute_vector();
  bin_sort.sort(view);
}

// Comparator-taking overload: declared in the design but not implemented.
/*template<class ViewType, class Comparator>
void sort(ViewType view, Comparator comp, bool always_use_kokkos_sort = false) {

}*/

}

#endif
diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..5fc94ac0f82f22cca2e070f8f68f94dd8075a052
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/Makefile
@@ -0,0 +1,92 @@
KOKKOS_PATH = ../..

GTEST_PATH = ../../TPL/gtest

# Let pattern rules find the test sources in the algorithms tree.
vpath %.cpp ${KOKKOS_PATH}/algorithms/unit_tests

# 'default' is a command, not a file: mark it phony so a stray file named
# 'default' cannot mask it.
.PHONY: default

default: build_all
	echo "End Build"


include $(KOKKOS_PATH)/Makefile.kokkos

# Toolchain selection: CUDA builds must go through nvcc_wrapper; otherwise
# keep the user's choices (?= so environment/command-line overrides win).
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  CXX = nvcc_wrapper
  CXXFLAGS ?= -O3
  LINK = $(CXX)
  LDFLAGS ?= -lpthread
else
  CXX ?= g++
  CXXFLAGS ?= -O3
  LINK ?= $(CXX)
  LDFLAGS ?= -lpthread
endif

KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests

TEST_TARGETS =
TARGETS =

# One unit-test binary (and one run target) per enabled backend.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o
  TARGETS += KokkosAlgorithms_UnitTest_Cuda
  TEST_TARGETS += test-cuda
endif

ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
  OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
  TARGETS += KokkosAlgorithms_UnitTest_Threads
  TEST_TARGETS += test-threads
endif

ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
  OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o
  TARGETS += KokkosAlgorithms_UnitTest_OpenMP
  TEST_TARGETS += test-openmp
endif

ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
  OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
  TARGETS += KokkosAlgorithms_UnitTest_Serial
  TEST_TARGETS += test-serial
endif

# Link rules. $@ avoids repeating each target name in its own recipe.
KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o $@

KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o $@

KokkosAlgorithms_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o $@

KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o $@

# All run/aggregate/clean targets are commands, not files: declare them
# phony so files with these names (e.g. a stray 'test' or 'clean') cannot
# silently mask them and so they always execute.
.PHONY: test-cuda test-threads test-openmp test-serial build_all test clean

test-cuda: KokkosAlgorithms_UnitTest_Cuda
	./KokkosAlgorithms_UnitTest_Cuda

test-threads: KokkosAlgorithms_UnitTest_Threads
	./KokkosAlgorithms_UnitTest_Threads

test-openmp: KokkosAlgorithms_UnitTest_OpenMP
	./KokkosAlgorithms_UnitTest_OpenMP

test-serial: KokkosAlgorithms_UnitTest_Serial
	./KokkosAlgorithms_UnitTest_Serial

# Build every test binary enabled by the backend configuration.
build_all: $(TARGETS)

# Run every enabled test binary.
test: $(TEST_TARGETS)

clean: kokkos-clean
	rm -f *.o $(TARGETS)

# Compilation rules

%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

# $< is the first prerequisite (the gtest amalgamation source).
gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

diff --git a/lib/kokkos/algorithms/unit_tests/TestCuda.cpp b/lib/kokkos/algorithms/unit_tests/TestCuda.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..d19c778c4663bff82e50037d2d1b6ffaeeff103d
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestCuda.cpp
@@ -0,0 +1,110 @@
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 2.0
//              Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3.
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdint.h> +#include <iostream> +#include <iomanip> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#ifdef KOKKOS_HAVE_CUDA + +#include <TestRandom.hpp> +#include <TestSort.hpp> + +namespace Test { + +class cuda : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Cuda::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +void cuda_test_random_xorshift64( int num_draws ) +{ + Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Cuda> >(num_draws); +} + +void cuda_test_random_xorshift1024( int num_draws ) +{ + Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Cuda> >(num_draws); +} + + +#define CUDA_RANDOM_XORSHIFT64( num_draws ) \ + TEST_F( cuda, Random_XorShift64 ) { \ + cuda_test_random_xorshift64(num_draws); \ + } + +#define CUDA_RANDOM_XORSHIFT1024( num_draws ) \ + TEST_F( cuda, Random_XorShift1024 ) { \ + cuda_test_random_xorshift1024(num_draws); \ + } + +#define CUDA_SORT_UNSIGNED( size ) \ + TEST_F( cuda, SortUnsigned ) { \ + Impl::test_sort< Kokkos::Cuda, unsigned >(size); \ + } + +CUDA_RANDOM_XORSHIFT64( 132141141 ) +CUDA_RANDOM_XORSHIFT1024( 52428813 ) +CUDA_SORT_UNSIGNED(171) + +#undef CUDA_RANDOM_XORSHIFT64 +#undef CUDA_RANDOM_XORSHIFT1024 +#undef CUDA_SORT_UNSIGNED +} + +#endif /* #ifdef KOKKOS_HAVE_CUDA */ + diff --git a/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp new file mode 100755 index 0000000000000000000000000000000000000000..4b06dffcb6a068503770229091ab15330bf6af89 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp @@ -0,0 +1,102 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +//---------------------------------------------------------------------------- +#include <TestRandom.hpp> +#include <TestSort.hpp> +#include <iomanip> + +namespace Test { + +#ifdef KOKKOS_HAVE_OPENMP +class openmp : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + + unsigned threads_count = omp_get_max_threads(); + + if ( Kokkos::hwloc::available() ) { + threads_count = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa(); + } + + Kokkos::OpenMP::initialize( threads_count ); + } + + static void TearDownTestCase() + { + Kokkos::OpenMP::finalize(); + } +}; + +#define OPENMP_RANDOM_XORSHIFT64( num_draws ) \ + TEST_F( openmp, Random_XorShift64 ) { \ + Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::OpenMP> >(num_draws); \ + } + +#define OPENMP_RANDOM_XORSHIFT1024( num_draws ) \ + TEST_F( openmp, Random_XorShift1024 ) { \ + Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::OpenMP> >(num_draws); \ + } + +#define OPENMP_SORT_UNSIGNED( size ) \ + TEST_F( openmp, SortUnsigned ) { \ + Impl::test_sort< Kokkos::OpenMP, unsigned >(size); \ + } + +OPENMP_RANDOM_XORSHIFT64( 10240000 ) +OPENMP_RANDOM_XORSHIFT1024( 10130144 ) +OPENMP_SORT_UNSIGNED(171) + +#undef OPENMP_RANDOM_XORSHIFT64 +#undef OPENMP_RANDOM_XORSHIFT1024 +#undef OPENMP_SORT_UNSIGNED +#endif +} // namespace test + diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp new file mode 100755 index 0000000000000000000000000000000000000000..eade74ed93074dc0f25d9a8fcd810ff436fc5523 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -0,0 +1,476 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_DUALVIEW_HPP +#define KOKKOS_TEST_DUALVIEW_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <cstdlib> +#include <cstdio> +#include <impl/Kokkos_Timer.hpp> +#include <Kokkos_Core.hpp> +#include <Kokkos_Random.hpp> +#include <cmath> + +namespace Test { + +namespace Impl{ + +// This test runs the random number generators and uses some statistic tests to +// check the 'goodness' of the random numbers: +// (i) mean: the mean is expected to be 0.5*RAND_MAX +// (ii) variance: the variance is 1/3*mean*mean +// (iii) covariance: the covariance is 0 +// (iv) 1-tupledistr: the mean, variance and covariance of a 1D Histrogram of random numbers +// (v) 3-tupledistr: the mean, variance and covariance of a 3D Histrogram of random numbers + +#define HIST_DIM3D 24 +#define HIST_DIM1D (HIST_DIM3D*HIST_DIM3D*HIST_DIM3D) + +struct RandomProperties { + uint64_t count; + double mean; + double variance; + double covariance; + double min; + double max; + + KOKKOS_INLINE_FUNCTION + RandomProperties() { + count = 0; + mean = 0.0; + variance = 0.0; + covariance = 0.0; + min = 1e64; + max = -1e64; + } + + KOKKOS_INLINE_FUNCTION + RandomProperties& operator+=(const RandomProperties& add) { + count += add.count; + mean += add.mean; + variance += add.variance; + covariance += add.covariance; + min = add.min<min?add.min:min; + max = add.max>max?add.max:max; + return *this; + } + + KOKKOS_INLINE_FUNCTION + void operator+=(const volatile RandomProperties& add) volatile { + count += add.count; + mean += add.mean; + variance += add.variance; + covariance += add.covariance; + min = add.min<min?add.min:min; + max = add.max>max?add.max:max; + } +}; + +template<class GeneratorPool, class Scalar> +struct test_random_functor { + typedef typename GeneratorPool::generator_type rnd_type; + + typedef RandomProperties value_type; + typedef typename 
GeneratorPool::device_type device_type; + + GeneratorPool rand_pool; + const double mean; + + // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define + // an exclusive upper bound on the range of random numbers that + // draw() can generate. However, for the float specialization, some + // implementations might violate this upper bound, due to rounding + // error. Just in case, we leave an extra space at the end of each + // dimension, in the View types below. + typedef Kokkos::View<int[HIST_DIM1D+1],typename GeneratorPool::device_type> type_1d; + type_1d density_1d; + typedef Kokkos::View<int[HIST_DIM3D+1][HIST_DIM3D+1][HIST_DIM3D+1],typename GeneratorPool::device_type> type_3d; + type_3d density_3d; + + test_random_functor (GeneratorPool rand_pool_, type_1d d1d, type_3d d3d) : + rand_pool (rand_pool_), + mean (0.5*Kokkos::rand<rnd_type,Scalar>::max ()), + density_1d (d1d), + density_3d (d3d) + {} + + KOKKOS_INLINE_FUNCTION + void operator() (int i, RandomProperties& prop) const { + using Kokkos::atomic_fetch_add; + + rnd_type rand_gen = rand_pool.get_state(); + for (int k = 0; k < 1024; ++k) { + const Scalar tmp = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen); + prop.count++; + prop.mean += tmp; + prop.variance += (tmp-mean)*(tmp-mean); + const Scalar tmp2 = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen); + prop.count++; + prop.mean += tmp2; + prop.variance += (tmp2-mean)*(tmp2-mean); + prop.covariance += (tmp-mean)*(tmp2-mean); + const Scalar tmp3 = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen); + prop.count++; + prop.mean += tmp3; + prop.variance += (tmp3-mean)*(tmp3-mean); + prop.covariance += (tmp2-mean)*(tmp3-mean); + + // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to + // define an exclusive upper bound on the range of random + // numbers that draw() can generate. However, for the float + // specialization, some implementations might violate this upper + // bound, due to rounding error. 
Just in case, we have left an + // extra space at the end of each dimension of density_1d and + // density_3d. + // + // Please note that those extra entries might not get counted in + // the histograms. However, if Kokkos::rand is broken and only + // returns values of max(), the histograms will still catch this + // indirectly, since none of the other values will be filled in. + + const Scalar theMax = Kokkos::rand<rnd_type, Scalar>::max (); + + const uint64_t ind1_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp / theMax); + const uint64_t ind2_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp2 / theMax); + const uint64_t ind3_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp3 / theMax); + + const uint64_t ind1_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp / theMax); + const uint64_t ind2_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp2 / theMax); + const uint64_t ind3_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp3 / theMax); + + atomic_fetch_add (&density_1d(ind1_1d), 1); + atomic_fetch_add (&density_1d(ind2_1d), 1); + atomic_fetch_add (&density_1d(ind3_1d), 1); + atomic_fetch_add (&density_3d(ind1_3d, ind2_3d, ind3_3d), 1); + } + rand_pool.free_state(rand_gen); + } +}; + +template<class DeviceType> +struct test_histogram1d_functor { + typedef RandomProperties value_type; + typedef typename DeviceType::execution_space execution_space; + typedef typename DeviceType::memory_space memory_space; + + // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define + // an exclusive upper bound on the range of random numbers that + // draw() can generate. However, for the float specialization, some + // implementations might violate this upper bound, due to rounding + // error. Just in case, we leave an extra space at the end of each + // dimension, in the View type below. 
+ typedef Kokkos::View<int[HIST_DIM1D+1], memory_space> type_1d; + type_1d density_1d; + double mean; + + test_histogram1d_functor (type_1d d1d, int num_draws) : + density_1d (d1d), + mean (1.0*num_draws/HIST_DIM1D*3) + { + printf ("Mean: %e\n", mean); + } + + KOKKOS_INLINE_FUNCTION void + operator() (const typename memory_space::size_type i, + RandomProperties& prop) const + { + typedef typename memory_space::size_type size_type; + const double count = density_1d(i); + prop.mean += count; + prop.variance += 1.0 * (count - mean) * (count - mean); + //prop.covariance += 1.0*count*count; + prop.min = count < prop.min ? count : prop.min; + prop.max = count > prop.max ? count : prop.max; + if (i < static_cast<size_type> (HIST_DIM1D-1)) { + prop.covariance += (count - mean) * (density_1d(i+1) - mean); + } + } +}; + +template<class DeviceType> +struct test_histogram3d_functor { + typedef RandomProperties value_type; + typedef typename DeviceType::execution_space execution_space; + typedef typename DeviceType::memory_space memory_space; + + // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define + // an exclusive upper bound on the range of random numbers that + // draw() can generate. However, for the float specialization, some + // implementations might violate this upper bound, due to rounding + // error. Just in case, we leave an extra space at the end of each + // dimension, in the View type below. 
+ typedef Kokkos::View<int[HIST_DIM3D+1][HIST_DIM3D+1][HIST_DIM3D+1], memory_space> type_3d; + type_3d density_3d; + double mean; + + test_histogram3d_functor (type_3d d3d, int num_draws) : + density_3d (d3d), + mean (1.0*num_draws/HIST_DIM1D) + {} + + KOKKOS_INLINE_FUNCTION void + operator() (const typename memory_space::size_type i, + RandomProperties& prop) const + { + typedef typename memory_space::size_type size_type; + const double count = density_3d(i/(HIST_DIM3D*HIST_DIM3D), + (i % (HIST_DIM3D*HIST_DIM3D))/HIST_DIM3D, + i % HIST_DIM3D); + prop.mean += count; + prop.variance += (count - mean) * (count - mean); + if (i < static_cast<size_type> (HIST_DIM1D-1)) { + const double count_next = density_3d((i+1)/(HIST_DIM3D*HIST_DIM3D), + ((i+1)%(HIST_DIM3D*HIST_DIM3D))/HIST_DIM3D, + (i+1)%HIST_DIM3D); + prop.covariance += (count - mean) * (count_next - mean); + } + } +}; + +// +// Templated test that uses the above functors. +// +template <class RandomGenerator,class Scalar> +struct test_random_scalar { + typedef typename RandomGenerator::generator_type rnd_type; + + int pass_mean,pass_var,pass_covar; + int pass_hist1d_mean,pass_hist1d_var,pass_hist1d_covar; + int pass_hist3d_mean,pass_hist3d_var,pass_hist3d_covar; + + test_random_scalar (typename test_random_functor<RandomGenerator,int>::type_1d& density_1d, + typename test_random_functor<RandomGenerator,int>::type_3d& density_3d, + RandomGenerator& pool, + unsigned int num_draws) + { + using std::cerr; + using std::endl; + using Kokkos::parallel_reduce; + + { + cerr << " -- Testing randomness properties" << endl; + + RandomProperties result; + typedef test_random_functor<RandomGenerator, Scalar> functor_type; + parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result); + + //printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2); + double tolerance = 2.0*sqrt(1.0/num_draws); + double mean_expect = 
0.5*Kokkos::rand<rnd_type,Scalar>::max(); + double variance_expect = 1.0/3.0*mean_expect*mean_expect; + double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0; + double variance_eps = variance_expect/(result.variance/num_draws/3)-1.0; + double covariance_eps = result.covariance/num_draws/2/variance_expect; + pass_mean = ((-tolerance < mean_eps) && + ( tolerance > mean_eps)) ? 1:0; + pass_var = ((-tolerance < variance_eps) && + ( tolerance > variance_eps)) ? 1:0; + pass_covar = ((-1.4*tolerance < covariance_eps) && + ( 1.4*tolerance > covariance_eps)) ? 1:0; + cerr << "Pass: " << pass_mean + << " " << pass_var + << " " << mean_eps + << " " << variance_eps + << " " << covariance_eps + << " || " << tolerance << endl; + } + { + cerr << " -- Testing 1-D histogram" << endl; + + RandomProperties result; + typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type; + parallel_reduce (HIST_DIM1D, functor_type (density_1d, num_draws), result); + + double tolerance = 6*sqrt(1.0/HIST_DIM1D); + double mean_expect = 1.0*num_draws*3/HIST_DIM1D; + double variance_expect = 1.0*num_draws*3/HIST_DIM1D*(1.0-1.0/HIST_DIM1D); + double covariance_expect = -1.0*num_draws*3/HIST_DIM1D/HIST_DIM1D; + double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0; + double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0; + double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect; + pass_hist1d_mean = ((-tolerance < mean_eps) && + ( tolerance > mean_eps)) ? 1:0; + pass_hist1d_var = ((-tolerance < variance_eps) && + ( tolerance > variance_eps)) ? 1:0; + pass_hist1d_covar = ((-tolerance < covariance_eps) && + ( tolerance > covariance_eps)) ? 
1:0; + + cerr << "Density 1D: " << mean_eps + << " " << variance_eps + << " " << (result.covariance/HIST_DIM1D/HIST_DIM1D) + << " || " << tolerance + << " " << result.min + << " " << result.max + << " || " << result.variance/HIST_DIM1D + << " " << 1.0*num_draws*3/HIST_DIM1D*(1.0-1.0/HIST_DIM1D) + << " || " << result.covariance/HIST_DIM1D + << " " << -1.0*num_draws*3/HIST_DIM1D/HIST_DIM1D + << endl; + } + { + cerr << " -- Testing 3-D histogram" << endl; + + RandomProperties result; + typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type; + parallel_reduce (HIST_DIM1D, functor_type (density_3d, num_draws), result); + + double tolerance = 6*sqrt(1.0/HIST_DIM1D); + double mean_expect = 1.0*num_draws/HIST_DIM1D; + double variance_expect = 1.0*num_draws/HIST_DIM1D*(1.0-1.0/HIST_DIM1D); + double covariance_expect = -1.0*num_draws/HIST_DIM1D/HIST_DIM1D; + double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0; + double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0; + double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect; + pass_hist3d_mean = ((-tolerance < mean_eps) && + ( tolerance > mean_eps)) ? 1:0; + pass_hist3d_var = ((-tolerance < variance_eps) && + ( tolerance > variance_eps)) ? 1:0; + pass_hist3d_covar = ((-tolerance < covariance_eps) && + ( tolerance > covariance_eps)) ? 
1:0; + + cerr << "Density 3D: " << mean_eps + << " " << variance_eps + << " " << result.covariance/HIST_DIM1D/HIST_DIM1D + << " || " << tolerance + << " " << result.min + << " " << result.max << endl; + } + } +}; + +template <class RandomGenerator> +void test_random(unsigned int num_draws) +{ + using std::cerr; + using std::endl; + typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d"); + typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d"); + + cerr << "Test Scalar=int" << endl; + RandomGenerator pool(31891); + test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws); + ASSERT_EQ( test_int.pass_mean,1); + ASSERT_EQ( test_int.pass_var,1); + ASSERT_EQ( test_int.pass_covar,1); + ASSERT_EQ( test_int.pass_hist1d_mean,1); + ASSERT_EQ( test_int.pass_hist1d_var,1); + ASSERT_EQ( test_int.pass_hist1d_covar,1); + ASSERT_EQ( test_int.pass_hist3d_mean,1); + ASSERT_EQ( test_int.pass_hist3d_var,1); + ASSERT_EQ( test_int.pass_hist3d_covar,1); + deep_copy(density_1d,0); + deep_copy(density_3d,0); + + cerr << "Test Scalar=unsigned int" << endl; + test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws); + ASSERT_EQ( test_uint.pass_mean,1); + ASSERT_EQ( test_uint.pass_var,1); + ASSERT_EQ( test_uint.pass_covar,1); + ASSERT_EQ( test_uint.pass_hist1d_mean,1); + ASSERT_EQ( test_uint.pass_hist1d_var,1); + ASSERT_EQ( test_uint.pass_hist1d_covar,1); + ASSERT_EQ( test_uint.pass_hist3d_mean,1); + ASSERT_EQ( test_uint.pass_hist3d_var,1); + ASSERT_EQ( test_uint.pass_hist3d_covar,1); + deep_copy(density_1d,0); + deep_copy(density_3d,0); + + cerr << "Test Scalar=int64_t" << endl; + test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws); + ASSERT_EQ( test_int64.pass_mean,1); + ASSERT_EQ( test_int64.pass_var,1); + ASSERT_EQ( test_int64.pass_covar,1); + ASSERT_EQ( test_int64.pass_hist1d_mean,1); + ASSERT_EQ( test_int64.pass_hist1d_var,1); + 
ASSERT_EQ( test_int64.pass_hist1d_covar,1); + ASSERT_EQ( test_int64.pass_hist3d_mean,1); + ASSERT_EQ( test_int64.pass_hist3d_var,1); + ASSERT_EQ( test_int64.pass_hist3d_covar,1); + deep_copy(density_1d,0); + deep_copy(density_3d,0); + + cerr << "Test Scalar=uint64_t" << endl; + test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws); + ASSERT_EQ( test_uint64.pass_mean,1); + ASSERT_EQ( test_uint64.pass_var,1); + ASSERT_EQ( test_uint64.pass_covar,1); + ASSERT_EQ( test_uint64.pass_hist1d_mean,1); + ASSERT_EQ( test_uint64.pass_hist1d_var,1); + ASSERT_EQ( test_uint64.pass_hist1d_covar,1); + ASSERT_EQ( test_uint64.pass_hist3d_mean,1); + ASSERT_EQ( test_uint64.pass_hist3d_var,1); + ASSERT_EQ( test_uint64.pass_hist3d_covar,1); + deep_copy(density_1d,0); + deep_copy(density_3d,0); + + cerr << "Test Scalar=float" << endl; + test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws); + ASSERT_EQ( test_float.pass_mean,1); + ASSERT_EQ( test_float.pass_var,1); + ASSERT_EQ( test_float.pass_covar,1); + ASSERT_EQ( test_float.pass_hist1d_mean,1); + ASSERT_EQ( test_float.pass_hist1d_var,1); + ASSERT_EQ( test_float.pass_hist1d_covar,1); + ASSERT_EQ( test_float.pass_hist3d_mean,1); + ASSERT_EQ( test_float.pass_hist3d_var,1); + ASSERT_EQ( test_float.pass_hist3d_covar,1); + deep_copy(density_1d,0); + deep_copy(density_3d,0); + + cerr << "Test Scalar=double" << endl; + test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws); + ASSERT_EQ( test_double.pass_mean,1); + ASSERT_EQ( test_double.pass_var,1); + ASSERT_EQ( test_double.pass_covar,1); + ASSERT_EQ( test_double.pass_hist1d_mean,1); + ASSERT_EQ( test_double.pass_hist1d_var,1); + ASSERT_EQ( test_double.pass_hist1d_covar,1); + ASSERT_EQ( test_double.pass_hist3d_mean,1); + ASSERT_EQ( test_double.pass_hist3d_var,1); + ASSERT_EQ( test_double.pass_hist3d_covar,1); +} +} + +} // namespace Test + +#endif //KOKKOS_TEST_UNORDERED_MAP_HPP 
diff --git a/lib/kokkos/algorithms/unit_tests/TestSerial.cpp b/lib/kokkos/algorithms/unit_tests/TestSerial.cpp new file mode 100755 index 0000000000000000000000000000000000000000..741cf97ae13f245fafeb95078222943afda8ed1d --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestSerial.cpp @@ -0,0 +1,99 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <TestRandom.hpp> +#include <TestSort.hpp> +#include <iomanip> + + +//---------------------------------------------------------------------------- + + +namespace Test { + +#ifdef KOKKOS_HAVE_SERIAL +class serial : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision (5) << std::scientific; + Kokkos::Serial::initialize (); + } + + static void TearDownTestCase () + { + Kokkos::Serial::finalize (); + } +}; + +#define SERIAL_RANDOM_XORSHIFT64( num_draws ) \ + TEST_F( serial, Random_XorShift64 ) { \ + Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Serial> >(num_draws); \ + } + +#define SERIAL_RANDOM_XORSHIFT1024( num_draws ) \ + TEST_F( serial, Random_XorShift1024 ) { \ + Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Serial> >(num_draws); \ + } + +#define SERIAL_SORT_UNSIGNED( size ) \ + TEST_F( serial, SortUnsigned ) { \ + Impl::test_sort< Kokkos::Serial, unsigned >(size); \ + } + +SERIAL_RANDOM_XORSHIFT64( 10240000 ) +SERIAL_RANDOM_XORSHIFT1024( 10130144 ) +SERIAL_SORT_UNSIGNED(171) + +#undef SERIAL_RANDOM_XORSHIFT64 +#undef SERIAL_RANDOM_XORSHIFT1024 +#undef SERIAL_SORT_UNSIGNED + +#endif // KOKKOS_HAVE_SERIAL +} // namespace Test + 
+ diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp new file mode 100755 index 0000000000000000000000000000000000000000..ccbcbdd0011bbc577ac8c39b2f593ed35f2546ac --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp @@ -0,0 +1,206 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef TESTSORT_HPP_ +#define TESTSORT_HPP_ + +#include <gtest/gtest.h> +#include<Kokkos_Core.hpp> +#include<Kokkos_Random.hpp> +#include<Kokkos_Sort.hpp> + +namespace Test { + +namespace Impl{ + +template<class ExecutionSpace, class Scalar> +struct is_sorted_struct { + typedef unsigned int value_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<Scalar*,ExecutionSpace> keys; + + is_sorted_struct(Kokkos::View<Scalar*,ExecutionSpace> keys_):keys(keys_) {} + KOKKOS_INLINE_FUNCTION + void operator() (int i, unsigned int& count) const { + if(keys(i)>keys(i+1)) count++; + } +}; + +template<class ExecutionSpace, class Scalar> +struct sum { + typedef double value_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<Scalar*,ExecutionSpace> keys; + + sum(Kokkos::View<Scalar*,ExecutionSpace> keys_):keys(keys_) {} + KOKKOS_INLINE_FUNCTION + void operator() (int i, double& count) const { + count+=keys(i); + } +}; + +template<class ExecutionSpace, class Scalar> +struct bin3d_is_sorted_struct { + typedef unsigned int value_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<Scalar*[3],ExecutionSpace> keys; + + int max_bins; + Scalar min; + Scalar max; + + bin3d_is_sorted_struct(Kokkos::View<Scalar*[3],ExecutionSpace> keys_,int max_bins_,Scalar 
min_,Scalar max_): + keys(keys_),max_bins(max_bins_),min(min_),max(max_) { + } + KOKKOS_INLINE_FUNCTION + void operator() (int i, unsigned int& count) const { + int ix1 = int ((keys(i,0)-min)/max * max_bins); + int iy1 = int ((keys(i,1)-min)/max * max_bins); + int iz1 = int ((keys(i,2)-min)/max * max_bins); + int ix2 = int ((keys(i+1,0)-min)/max * max_bins); + int iy2 = int ((keys(i+1,1)-min)/max * max_bins); + int iz2 = int ((keys(i+1,2)-min)/max * max_bins); + + if (ix1>ix2) count++; + else if(ix1==ix2) { + if (iy1>iy2) count++; + else if ((iy1==iy2) && (iz1>iz2)) count++; + } + } +}; + +template<class ExecutionSpace, class Scalar> +struct sum3D { + typedef double value_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<Scalar*[3],ExecutionSpace> keys; + + sum3D(Kokkos::View<Scalar*[3],ExecutionSpace> keys_):keys(keys_) {} + KOKKOS_INLINE_FUNCTION + void operator() (int i, double& count) const { + count+=keys(i,0); + count+=keys(i,1); + count+=keys(i,2); + } +}; + +template<class ExecutionSpace, typename KeyType> +void test_1D_sort(unsigned int n,bool force_kokkos) { + typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType; + KeyViewType keys("Keys",n); + + Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931); + Kokkos::fill_random(keys,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND); + + double sum_before = 0.0; + double sum_after = 0.0; + unsigned int sort_fails = 0; + + Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys),sum_before); + + Kokkos::sort(keys,force_kokkos); + + Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys),sum_after); + Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys),sort_fails); + + double ratio = sum_before/sum_after; + double epsilon = 1e-10; + unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 
1 : 0; + + ASSERT_EQ(sort_fails,0); + ASSERT_EQ(equal_sum,1); +} + +template<class ExecutionSpace, typename KeyType> +void test_3D_sort(unsigned int n) { + typedef Kokkos::View<KeyType*[3],ExecutionSpace > KeyViewType; + + KeyViewType keys("Keys",n*n*n); + + Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931); + Kokkos::fill_random(keys,g,100.0); + + double sum_before = 0.0; + double sum_after = 0.0; + unsigned int sort_fails = 0; + + Kokkos::parallel_reduce(keys.dimension_0(),sum3D<ExecutionSpace, KeyType>(keys),sum_before); + + int bin_1d = 1; + while( bin_1d*bin_1d*bin_1d*4< (int) keys.dimension_0() ) bin_1d*=2; + int bin_max[3] = {bin_1d,bin_1d,bin_1d}; + typename KeyViewType::value_type min[3] = {0,0,0}; + typename KeyViewType::value_type max[3] = {100,100,100}; + + typedef Kokkos::SortImpl::DefaultBinOp3D< KeyViewType > BinOp; + BinOp bin_op(bin_max,min,max); + Kokkos::BinSort< KeyViewType , BinOp > + Sorter(keys,bin_op,false); + Sorter.create_permute_vector(); + Sorter.template sort< KeyViewType >(keys); + + Kokkos::parallel_reduce(keys.dimension_0(),sum3D<ExecutionSpace, KeyType>(keys),sum_after); + Kokkos::parallel_reduce(keys.dimension_0()-1,bin3d_is_sorted_struct<ExecutionSpace, KeyType>(keys,bin_1d,min[0],max[0]),sort_fails); + + double ratio = sum_before/sum_after; + double epsilon = 1e-10; + unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 
1 : 0; + + printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails); + ASSERT_EQ(sort_fails,0); + ASSERT_EQ(equal_sum,1); +} + +template<class ExecutionSpace, typename KeyType> +void test_sort(unsigned int N) +{ + test_1D_sort<ExecutionSpace,KeyType>(N*N*N, true); + test_1D_sort<ExecutionSpace,KeyType>(N*N*N, false); + test_3D_sort<ExecutionSpace,KeyType>(N); +} + +} +} +#endif /* TESTSORT_HPP_ */ diff --git a/lib/kokkos/algorithms/unit_tests/TestThreads.cpp b/lib/kokkos/algorithms/unit_tests/TestThreads.cpp new file mode 100755 index 0000000000000000000000000000000000000000..a61d6c8bd59bb9758f7ff30124b048150ac0cb92 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestThreads.cpp @@ -0,0 +1,113 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <TestRandom.hpp> +#include <TestSort.hpp> +#include <iomanip> + + +//---------------------------------------------------------------------------- + + +namespace Test { + +#ifdef KOKKOS_HAVE_PTHREAD +class threads : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + + unsigned num_threads = 4; + + if (Kokkos::hwloc::available()) { + num_threads = Kokkos::hwloc::get_available_numa_count() + * Kokkos::hwloc::get_available_cores_per_numa() + // * Kokkos::hwloc::get_available_threads_per_core() + ; + + } + + std::cout << "Threads: " << num_threads << std::endl; + + Kokkos::Threads::initialize( num_threads ); + } + + static void TearDownTestCase() + { + Kokkos::Threads::finalize(); + } +}; + +#define THREADS_RANDOM_XORSHIFT64( num_draws ) \ + TEST_F( threads, Random_XorShift64 ) { \ + Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Threads> >(num_draws); \ + } + +#define THREADS_RANDOM_XORSHIFT1024( num_draws ) \ + TEST_F( threads, Random_XorShift1024 ) { \ + Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Threads> >(num_draws); \ + } + +#define THREADS_SORT_UNSIGNED( size ) \ + TEST_F( threads, SortUnsigned ) { 
\ + Impl::test_sort< Kokkos::Threads, double >(size); \ + } + + +THREADS_RANDOM_XORSHIFT64( 10240000 ) +THREADS_RANDOM_XORSHIFT1024( 10130144 ) +THREADS_SORT_UNSIGNED(171) + +#undef THREADS_RANDOM_XORSHIFT64 +#undef THREADS_RANDOM_XORSHIFT1024 +#undef THREADS_SORT_UNSIGNED + +#endif +} // namespace Test + + diff --git a/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp new file mode 100755 index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp @@ -0,0 +1,50 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +int main(int argc, char *argv[]) { + ::testing::InitGoogleTest(&argc,argv); + return RUN_ALL_TESTS(); +} + diff --git a/lib/kokkos/containers/performance_tests/Makefile b/lib/kokkos/containers/performance_tests/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..7ced9452826bc72cd957d7a1943bf55f9f01285d --- /dev/null +++ b/lib/kokkos/containers/performance_tests/Makefile @@ -0,0 +1,81 @@ +KOKKOS_PATH = ../.. 
+ +GTEST_PATH = ../../TPL/gtest + +vpath %.cpp ${KOKKOS_PATH}/containers/performance_tests + +default: build_all + echo "End Build" + + +include $(KOKKOS_PATH)/Makefile.kokkos + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + CXX = nvcc_wrapper + CXXFLAGS ?= -O3 + LINK = $(CXX) + LDFLAGS ?= -lpthread +else + CXX ?= g++ + CXXFLAGS ?= -O3 + LINK ?= $(CXX) + LDFLAGS ?= -lpthread +endif + +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/performance_tests + +TEST_TARGETS = +TARGETS = + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + OBJ_CUDA = TestCuda.o TestMain.o gtest-all.o + TARGETS += KokkosContainers_PerformanceTest_Cuda + TEST_TARGETS += test-cuda +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + OBJ_THREADS = TestThreads.o TestMain.o gtest-all.o + TARGETS += KokkosContainers_PerformanceTest_Threads + TEST_TARGETS += test-threads +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + OBJ_OPENMP = TestOpenMP.o TestMain.o gtest-all.o + TARGETS += KokkosContainers_PerformanceTest_OpenMP + TEST_TARGETS += test-openmp +endif + +KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda + +KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads + +KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_OpenMP + +test-cuda: KokkosContainers_PerformanceTest_Cuda + ./KokkosContainers_PerformanceTest_Cuda + +test-threads: KokkosContainers_PerformanceTest_Threads + ./KokkosContainers_PerformanceTest_Threads + +test-openmp: KokkosContainers_PerformanceTest_OpenMP + ./KokkosContainers_PerformanceTest_OpenMP + + +build_all: $(TARGETS) 
+ +test: $(TEST_TARGETS) + +clean: kokkos-clean + rm -f *.o $(TARGETS) + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< + +gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc + diff --git a/lib/kokkos/containers/performance_tests/TestCuda.cpp b/lib/kokkos/containers/performance_tests/TestCuda.cpp new file mode 100755 index 0000000000000000000000000000000000000000..aee262de93eecfe79314e217252bbcd15a847353 --- /dev/null +++ b/lib/kokkos/containers/performance_tests/TestCuda.cpp @@ -0,0 +1,100 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdint.h> +#include <string> +#include <iostream> +#include <iomanip> +#include <sstream> +#include <fstream> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if defined( KOKKOS_HAVE_CUDA ) + +#include <Kokkos_UnorderedMap.hpp> + +#include <TestGlobal2LocalIds.hpp> + +#include <TestUnorderedMapPerformance.hpp> + +namespace Performance { + +class cuda : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Cuda::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +TEST_F( cuda, global_2_local) +{ + std::cout << "Cuda" << std::endl; + std::cout << "size, create, generate, fill, find" << std::endl; + for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step) + test_global_to_local_ids<Kokkos::Cuda>(i); +} + +TEST_F( cuda, unordered_map_performance_near) +{ + Perf::run_performance_tests<Kokkos::Cuda,true>("cuda-near"); +} + +TEST_F( cuda, unordered_map_performance_far) +{ + Perf::run_performance_tests<Kokkos::Cuda,false>("cuda-far"); +} + +} + +#endif /* #if 
defined( KOKKOS_HAVE_CUDA ) */ diff --git a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp new file mode 100755 index 0000000000000000000000000000000000000000..fb70b8fe2e9dc8c0a0cc5ed6787b8afa86e666df --- /dev/null +++ b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp @@ -0,0 +1,231 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP +#define KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP + +#include <Kokkos_Core.hpp> +#include <Kokkos_UnorderedMap.hpp> +#include <vector> +#include <algorithm> + +#include <impl/Kokkos_Timer.hpp> + +// This test will simulate global ids + +namespace Performance { + +static const unsigned begin_id_size = 256u; +static const unsigned end_id_size = 1u << 22; +static const unsigned id_step = 2u; + +union helper +{ + uint32_t word; + uint8_t byte[4]; +}; + + +template <typename Device> +struct generate_ids +{ + typedef Device execution_space; + typedef typename execution_space::size_type size_type; + typedef Kokkos::View<uint32_t*,execution_space> local_id_view; + + local_id_view local_2_global; + + generate_ids( local_id_view & ids) + : local_2_global(ids) + { + Kokkos::parallel_for(local_2_global.dimension_0(), *this); + } + + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const + { + + helper x = {static_cast<uint32_t>(i)}; + + // shuffle the bytes of i to create a unique, semi-random global_id + x.word = ~x.word; + + uint8_t tmp = x.byte[3]; + x.byte[3] = x.byte[1]; + x.byte[1] = tmp; + + tmp = x.byte[2]; + x.byte[2] = x.byte[0]; + x.byte[0] = tmp; + + local_2_global[i] = x.word; + } + +}; + +template <typename Device> +struct 
fill_map +{ + typedef Device execution_space; + typedef typename execution_space::size_type size_type; + typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view; + typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view; + + global_id_view global_2_local; + local_id_view local_2_global; + + fill_map( global_id_view gIds, local_id_view lIds) + : global_2_local(gIds) , local_2_global(lIds) + { + Kokkos::parallel_for(local_2_global.dimension_0(), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const + { + global_2_local.insert( local_2_global[i], i); + } + +}; + +template <typename Device> +struct find_test +{ + typedef Device execution_space; + typedef typename execution_space::size_type size_type; + typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view; + typedef Kokkos::UnorderedMap<const uint32_t, const size_type,execution_space> global_id_view; + + global_id_view global_2_local; + local_id_view local_2_global; + + typedef size_t value_type; + + find_test( global_id_view gIds, local_id_view lIds, value_type & num_errors) + : global_2_local(gIds) , local_2_global(lIds) + { + Kokkos::parallel_reduce(local_2_global.dimension_0(), *this, num_errors); + } + + KOKKOS_INLINE_FUNCTION + void init(value_type & v) const + { v = 0; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type & dst, volatile value_type const & src) const + { dst += src; } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, value_type & num_errors) const + { + uint32_t index = global_2_local.find( local_2_global[i] ); + + if ( global_2_local.value_at(index) != i) ++num_errors; + } + +}; + +template <typename Device> +void test_global_to_local_ids(unsigned num_ids) +{ + + typedef Device execution_space; + typedef typename execution_space::size_type size_type; + + typedef Kokkos::View<uint32_t*,execution_space> local_id_view; + typedef 
Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view; + + //size + std::cout << num_ids << ", "; + + double elasped_time = 0; + Kokkos::Impl::Timer timer; + + local_id_view local_2_global("local_ids", num_ids); + global_id_view global_2_local((3u*num_ids)/2u); + + //create + elasped_time = timer.seconds(); + std::cout << elasped_time << ", "; + timer.reset(); + + // generate unique ids + { + generate_ids<Device> gen(local_2_global); + } + Device::fence(); + // generate + elasped_time = timer.seconds(); + std::cout << elasped_time << ", "; + timer.reset(); + + { + fill_map<Device> fill(global_2_local, local_2_global); + } + Device::fence(); + + // fill + elasped_time = timer.seconds(); + std::cout << elasped_time << ", "; + timer.reset(); + + + size_t num_errors = 0; + for (int i=0; i<100; ++i) + { + find_test<Device> find(global_2_local, local_2_global,num_errors); + } + Device::fence(); + + // find + elasped_time = timer.seconds(); + std::cout << elasped_time << std::endl; + + ASSERT_EQ( num_errors, 0u); +} + + +} // namespace Performance + + +#endif //KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP + diff --git a/lib/kokkos/containers/performance_tests/TestMain.cpp b/lib/kokkos/containers/performance_tests/TestMain.cpp new file mode 100755 index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87 --- /dev/null +++ b/lib/kokkos/containers/performance_tests/TestMain.cpp @@ -0,0 +1,50 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +int main(int argc, char *argv[]) { + ::testing::InitGoogleTest(&argc,argv); + return RUN_ALL_TESTS(); +} + diff --git a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp new file mode 100755 index 0000000000000000000000000000000000000000..82a9311df71108d2f05b6020a31764f91be36600 --- /dev/null +++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp @@ -0,0 +1,131 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <Kokkos_UnorderedMap.hpp> + +#include <TestGlobal2LocalIds.hpp> +#include <TestUnorderedMapPerformance.hpp> + +#include <iomanip> +#include <sstream> +#include <string> +#include <fstream> + + +namespace Performance { + +class openmp : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + + unsigned num_threads = 4; + + if (Kokkos::hwloc::available()) { + num_threads = Kokkos::hwloc::get_available_numa_count() + * Kokkos::hwloc::get_available_cores_per_numa() + * Kokkos::hwloc::get_available_threads_per_core() + ; + + } + + std::cout << "OpenMP: " << num_threads << std::endl; + + Kokkos::OpenMP::initialize( num_threads ); + + std::cout << "available threads: " << omp_get_max_threads() << std::endl; + } + + static void TearDownTestCase() + { + Kokkos::OpenMP::finalize(); + + omp_set_num_threads(1); + + ASSERT_EQ( 1 , omp_get_max_threads() ); + } +}; + +TEST_F( openmp, global_2_local) +{ + std::cout << "OpenMP" << std::endl; + std::cout << "size, create, generate, fill, find" << std::endl; + for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step) + 
test_global_to_local_ids<Kokkos::OpenMP>(i); +} + +TEST_F( openmp, unordered_map_performance_near) +{ + unsigned num_openmp = 4; + if (Kokkos::hwloc::available()) { + num_openmp = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core(); + + } + std::ostringstream base_file_name; + base_file_name << "openmp-" << num_openmp << "-near"; + Perf::run_performance_tests<Kokkos::OpenMP,true>(base_file_name.str()); +} + +TEST_F( openmp, unordered_map_performance_far) +{ + unsigned num_openmp = 4; + if (Kokkos::hwloc::available()) { + num_openmp = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core(); + + } + std::ostringstream base_file_name; + base_file_name << "openmp-" << num_openmp << "-far"; + Perf::run_performance_tests<Kokkos::OpenMP,false>(base_file_name.str()); +} + +} // namespace test + diff --git a/lib/kokkos/containers/performance_tests/TestThreads.cpp b/lib/kokkos/containers/performance_tests/TestThreads.cpp new file mode 100755 index 0000000000000000000000000000000000000000..04d9dc0c187f1006c563e84d55b16780485daec7 --- /dev/null +++ b/lib/kokkos/containers/performance_tests/TestThreads.cpp @@ -0,0 +1,126 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <Kokkos_UnorderedMap.hpp> + +#include <iomanip> + +#include <TestGlobal2LocalIds.hpp> +#include <TestUnorderedMapPerformance.hpp> + +#include <iomanip> +#include <sstream> +#include <string> +#include <fstream> + +namespace Performance { + +class threads : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + + unsigned num_threads = 4; + + if (Kokkos::hwloc::available()) { + num_threads = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core(); + + } + + std::cout << "Threads: " << num_threads << std::endl; + + Kokkos::Threads::initialize( num_threads ); + } + + static void TearDownTestCase() + { + Kokkos::Threads::finalize(); + } +}; + +TEST_F( threads, global_2_local) +{ + std::cout << "Threads" << std::endl; + std::cout << "size, create, generate, fill, find" << std::endl; + for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step) + test_global_to_local_ids<Kokkos::Threads>(i); +} + +TEST_F( threads, unordered_map_performance_near) +{ + unsigned num_threads = 4; + if (Kokkos::hwloc::available()) { + num_threads = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core(); + + } + std::ostringstream base_file_name; + base_file_name << "threads-" << num_threads << "-near"; + Perf::run_performance_tests<Kokkos::Threads,true>(base_file_name.str()); +} + +TEST_F( threads, unordered_map_performance_far) +{ + unsigned num_threads = 4; + if (Kokkos::hwloc::available()) { + num_threads = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + 
Kokkos::hwloc::get_available_threads_per_core(); + + } + std::ostringstream base_file_name; + base_file_name << "threads-" << num_threads << "-far"; + Perf::run_performance_tests<Kokkos::Threads,false>(base_file_name.str()); +} + +} // namespace Performance + + diff --git a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp new file mode 100755 index 0000000000000000000000000000000000000000..975800229cbcb67c6e7e788842a3db06d97f0a21 --- /dev/null +++ b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp @@ -0,0 +1,262 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP +#define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP + +#include <impl/Kokkos_Timer.hpp> + +#include <iostream> +#include <iomanip> +#include <fstream> +#include <string> +#include <sstream> + + +namespace Perf { + +template <typename Device, bool Near> +struct UnorderedMapTest +{ + typedef Device execution_space; + typedef Kokkos::UnorderedMap<uint32_t, uint32_t, execution_space> map_type; + typedef typename map_type::histogram_type histogram_type; + + struct value_type { + uint32_t failed_count; + uint32_t max_list; + }; + + uint32_t capacity; + uint32_t inserts; + uint32_t collisions; + double seconds; + map_type map; + histogram_type histogram; + + UnorderedMapTest( uint32_t arg_capacity, uint32_t arg_inserts, uint32_t arg_collisions) + : capacity(arg_capacity) + , inserts(arg_inserts) + , collisions(arg_collisions) + , seconds(0) + , map(capacity) + , histogram(map.get_histogram()) + { + Kokkos::Impl::Timer wall_clock ; + wall_clock.reset(); + + value_type v = {}; + int loop_count = 0; + do { + ++loop_count; + + v = value_type(); + Kokkos::parallel_reduce(inserts, *this, v); + + if (v.failed_count > 0u) { + const uint32_t new_capacity = map.capacity() + ((map.capacity()*3ull)/20u) + v.failed_count/collisions ; + 
map.rehash( new_capacity ); + } + } while (v.failed_count > 0u); + + seconds = wall_clock.seconds(); + + switch (loop_count) + { + case 1u: std::cout << " \033[0;32m" << loop_count << "\033[0m "; break; + case 2u: std::cout << " \033[1;31m" << loop_count << "\033[0m "; break; + default: std::cout << " \033[0;31m" << loop_count << "\033[0m "; break; + } + std::cout << std::setprecision(2) << std::fixed << std::setw(5) << (1e9*(seconds/(inserts))) << "; " << std::flush; + + histogram.calculate(); + Device::fence(); + } + + void print(std::ostream & metrics_out, std::ostream & length_out, std::ostream & distance_out, std::ostream & block_distance_out) + { + metrics_out << map.capacity() << " , "; + metrics_out << inserts/collisions << " , "; + metrics_out << (100.0 * inserts/collisions) / map.capacity() << " , "; + metrics_out << inserts << " , "; + metrics_out << (map.failed_insert() ? "true" : "false") << " , "; + metrics_out << collisions << " , "; + metrics_out << 1e9*(seconds/inserts) << " , "; + metrics_out << seconds << std::endl; + + length_out << map.capacity() << " , "; + length_out << ((100.0 *inserts/collisions) / map.capacity()) << " , "; + length_out << collisions << " , "; + histogram.print_length(length_out); + + distance_out << map.capacity() << " , "; + distance_out << ((100.0 *inserts/collisions) / map.capacity()) << " , "; + distance_out << collisions << " , "; + histogram.print_distance(distance_out); + + block_distance_out << map.capacity() << " , "; + block_distance_out << ((100.0 *inserts/collisions) / map.capacity()) << " , "; + block_distance_out << collisions << " , "; + histogram.print_block_distance(block_distance_out); + } + + + KOKKOS_INLINE_FUNCTION + void init( value_type & v ) const + { + v.failed_count = 0; + v.max_list = 0; + } + + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & dst, const volatile value_type & src ) const + { + dst.failed_count += src.failed_count; + dst.max_list = src.max_list < dst.max_list ? 
dst.max_list : src.max_list; + } + + KOKKOS_INLINE_FUNCTION + void operator()(uint32_t i, value_type & v) const + { + const uint32_t key = Near ? i/collisions : i%(inserts/collisions); + typename map_type::insert_result result = map.insert(key,i); + v.failed_count += !result.failed() ? 0 : 1; + v.max_list = result.list_position() < v.max_list ? v.max_list : result.list_position(); + } + +}; + +//#define KOKKOS_COLLECT_UNORDERED_MAP_METRICS + +template <typename Device, bool Near> +void run_performance_tests(std::string const & base_file_name) +{ +#if defined(KOKKOS_COLLECT_UNORDERED_MAP_METRICS) + std::string metrics_file_name = base_file_name + std::string("-metrics.csv"); + std::string length_file_name = base_file_name + std::string("-length.csv"); + std::string distance_file_name = base_file_name + std::string("-distance.csv"); + std::string block_distance_file_name = base_file_name + std::string("-block_distance.csv"); + + std::ofstream metrics_out( metrics_file_name.c_str(), std::ofstream::out ); + std::ofstream length_out( length_file_name.c_str(), std::ofstream::out ); + std::ofstream distance_out( distance_file_name.c_str(), std::ofstream::out ); + std::ofstream block_distance_out( block_distance_file_name.c_str(), std::ofstream::out ); + + + /* + const double test_ratios[] = { + 0.50 + , 0.75 + , 0.80 + , 0.85 + , 0.90 + , 0.95 + , 1.00 + , 1.25 + , 2.00 + }; + */ + + const double test_ratios[] = { 1.00 }; + + const int num_ratios = sizeof(test_ratios) / sizeof(double); + + /* + const uint32_t collisions[] { + 1 + , 4 + , 16 + , 64 + }; + */ + + const uint32_t collisions[] = { 16 }; + + const int num_collisions = sizeof(collisions) / sizeof(uint32_t); + + // set up file headers + metrics_out << "Capacity , Unique , Percent Full , Attempted Inserts , Failed Inserts , Collision Ratio , Nanoseconds/Inserts, Seconds" << std::endl; + length_out << "Capacity , Percent Full , "; + distance_out << "Capacity , Percent Full , "; + block_distance_out << "Capacity , 
Percent Full , "; + + for (int i=0; i<100; ++i) { + length_out << i << " , "; + distance_out << i << " , "; + block_distance_out << i << " , "; + } + + length_out << "\b\b\b " << std::endl; + distance_out << "\b\b\b " << std::endl; + block_distance_out << "\b\b\b " << std::endl; + + Kokkos::Impl::Timer wall_clock ; + for (int i=0; i < num_collisions ; ++i) { + wall_clock.reset(); + std::cout << "Collisions: " << collisions[i] << std::endl; + for (int j = 0; j < num_ratios; ++j) { + std::cout << std::setprecision(1) << std::fixed << std::setw(5) << (100.0*test_ratios[j]) << "% " << std::flush; + for (uint32_t capacity = 1<<14; capacity < 1<<25; capacity = capacity << 1) { + uint32_t inserts = static_cast<uint32_t>(test_ratios[j]*(capacity)); + std::cout << capacity << std::flush; + UnorderedMapTest<Device, Near> test(capacity, inserts*collisions[i], collisions[i]); + Device::fence(); + test.print(metrics_out, length_out, distance_out, block_distance_out); + } + std::cout << "\b\b " << std::endl; + + } + std::cout << " " << wall_clock.seconds() << " secs" << std::endl; + } + metrics_out.close(); + length_out.close(); + distance_out.close(); + block_distance_out.close(); +#else + (void)base_file_name; + std::cout << "skipping test" << std::endl; +#endif +} + + +} // namespace Perf + +#endif //KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp new file mode 100755 index 0000000000000000000000000000000000000000..b51b1c2b26560bc67a6e5e421242436cc0d435ce --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp @@ -0,0 +1,437 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_BITSET_HPP +#define KOKKOS_BITSET_HPP + +#include <Kokkos_Core.hpp> +#include <Kokkos_Functional.hpp> + +#include <impl/Kokkos_Bitset_impl.hpp> + +#include <stdexcept> + +namespace Kokkos { + +template <typename Device = Kokkos::DefaultExecutionSpace > +class Bitset; + +template <typename Device = Kokkos::DefaultExecutionSpace > +class ConstBitset; + +template <typename DstDevice, typename SrcDevice> +void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src); + +template <typename DstDevice, typename SrcDevice> +void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src); + +template <typename DstDevice, typename SrcDevice> +void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src); + + +/// A thread safe view to a bitset +template <typename Device> +class Bitset +{ +public: + typedef Device execution_space; + typedef unsigned size_type; + + enum { BIT_SCAN_REVERSE = 1u }; + enum { MOVE_HINT_BACKWARD = 2u }; + + enum { + BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u + , BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = BIT_SCAN_REVERSE + , BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD + , BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD + }; + +private: + enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) }; + enum { block_mask = block_size-1u }; + enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) }; + +public: + + + /// constructor + /// arg_size := number of bit in set + Bitset(unsigned arg_size = 0u) + : m_size(arg_size) + , m_last_block_mask(0u) + , m_blocks("Bitset", ((m_size + block_mask) >> block_shift) ) + { + for (int i=0, end = static_cast<int>(m_size & block_mask); i < end; ++i) { + m_last_block_mask |= 1u << i; + } + } + + /// assignment + Bitset<Device> & operator = (Bitset<Device> const & 
rhs) + { + this->m_size = rhs.m_size; + this->m_last_block_mask = rhs.m_last_block_mask; + this->m_blocks = rhs.m_blocks; + + return *this; + } + + /// copy constructor + Bitset( Bitset<Device> const & rhs) + : m_size( rhs.m_size ) + , m_last_block_mask( rhs.m_last_block_mask ) + , m_blocks( rhs.m_blocks ) + {} + + /// number of bits in the set + /// can be call from the host or the device + KOKKOS_FORCEINLINE_FUNCTION + unsigned size() const + { return m_size; } + + /// number of bits which are set to 1 + /// can only be called from the host + unsigned count() const + { + Impl::BitsetCount< Bitset<Device> > f(*this); + return f.apply(); + } + + /// set all bits to 1 + /// can only be called from the host + void set() + { + Kokkos::deep_copy(m_blocks, ~0u ); + + if (m_last_block_mask) { + //clear the unused bits in the last block + typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy; + raw_deep_copy( m_blocks.ptr_on_device() + (m_blocks.dimension_0() -1u), &m_last_block_mask, sizeof(unsigned)); + } + } + + /// set all bits to 0 + /// can only be called from the host + void reset() + { + Kokkos::deep_copy(m_blocks, 0u ); + } + + /// set all bits to 0 + /// can only be called from the host + void clear() + { + Kokkos::deep_copy(m_blocks, 0u ); + } + + /// set i'th bit to 1 + /// can only be called from the device + KOKKOS_FORCEINLINE_FUNCTION + bool set( unsigned i ) const + { + if ( i < m_size ) { + unsigned * block_ptr = &m_blocks[ i >> block_shift ]; + const unsigned mask = 1u << static_cast<int>( i & block_mask ); + + return !( atomic_fetch_or( block_ptr, mask ) & mask ); + } + return false; + } + + /// set i'th bit to 0 + /// can only be called from the device + KOKKOS_FORCEINLINE_FUNCTION + bool reset( unsigned i ) const + { + if ( i < m_size ) { + unsigned * block_ptr = &m_blocks[ i >> block_shift ]; + const unsigned mask = 1u << static_cast<int>( i & block_mask ); + + return atomic_fetch_and( block_ptr, 
~mask ) & mask; + } + return false; + } + + /// return true if the i'th bit set to 1 + /// can only be called from the device + KOKKOS_FORCEINLINE_FUNCTION + bool test( unsigned i ) const + { + if ( i < m_size ) { + const unsigned block = volatile_load(&m_blocks[ i >> block_shift ]); + const unsigned mask = 1u << static_cast<int>( i & block_mask ); + return block & mask; + } + return false; + } + + /// used with find_any_set_near or find_any_unset_near functions + /// returns the max number of times those functions should be call + /// when searching for an available bit + KOKKOS_FORCEINLINE_FUNCTION + unsigned max_hint() const + { + return m_blocks.dimension_0(); + } + + /// find a bit set to 1 near the hint + /// returns a pair< bool, unsigned> where if result.first is true then result.second is the bit found + /// and if result.first is false the result.second is a new hint + KOKKOS_INLINE_FUNCTION + Kokkos::pair<bool, unsigned> find_any_set_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const + { + const unsigned block_idx = (hint >> block_shift) < m_blocks.dimension_0() ? (hint >> block_shift) : 0; + const unsigned offset = hint & block_mask; + unsigned block = volatile_load(&m_blocks[ block_idx ]); + block = !m_last_block_mask || (block_idx < (m_blocks.dimension_0()-1)) ? 
block : block & m_last_block_mask ; + + return find_any_helper(block_idx, offset, block, scan_direction); + } + + /// find a bit set to 0 near the hint + /// returns a pair< bool, unsigned> where if result.first is true then result.second is the bit found + /// and if result.first is false the result.second is a new hint + KOKKOS_INLINE_FUNCTION + Kokkos::pair<bool, unsigned> find_any_unset_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const + { + const unsigned block_idx = hint >> block_shift; + const unsigned offset = hint & block_mask; + unsigned block = volatile_load(&m_blocks[ block_idx ]); + block = !m_last_block_mask || (block_idx < (m_blocks.dimension_0()-1) ) ? ~block : ~block & m_last_block_mask ; + + return find_any_helper(block_idx, offset, block, scan_direction); + } + +private: + + KOKKOS_FORCEINLINE_FUNCTION + Kokkos::pair<bool, unsigned> find_any_helper(unsigned block_idx, unsigned offset, unsigned block, unsigned scan_direction) const + { + Kokkos::pair<bool, unsigned> result( block > 0u, 0); + + if (!result.first) { + result.second = update_hint( block_idx, offset, scan_direction ); + } + else { + result.second = scan_block( (block_idx << block_shift) + , offset + , block + , scan_direction + ); + } + return result; + } + + + KOKKOS_FORCEINLINE_FUNCTION + unsigned scan_block(unsigned block_start, int offset, unsigned block, unsigned scan_direction ) const + { + offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask; + block = Impl::rotate_right(block, offset); + return ((( !(scan_direction & BIT_SCAN_REVERSE) ? + Impl::bit_scan_forward(block) : + Impl::bit_scan_reverse(block) + ) + offset + ) & block_mask + ) + block_start; + } + + KOKKOS_FORCEINLINE_FUNCTION + unsigned update_hint( long long block_idx, unsigned offset, unsigned scan_direction ) const + { + block_idx += scan_direction & MOVE_HINT_BACKWARD ? -1 : 1; + block_idx = block_idx >= 0 ? 
block_idx : m_blocks.dimension_0() - 1; + block_idx = block_idx < static_cast<long long>(m_blocks.dimension_0()) ? block_idx : 0; + + return static_cast<unsigned>(block_idx)*block_size + offset; + } + +private: + + unsigned m_size; + unsigned m_last_block_mask; + View< unsigned *, execution_space, MemoryTraits<RandomAccess> > m_blocks; + +private: + template <typename DDevice> + friend class Bitset; + + template <typename DDevice> + friend class ConstBitset; + + template <typename Bitset> + friend struct Impl::BitsetCount; + + template <typename DstDevice, typename SrcDevice> + friend void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src); + + template <typename DstDevice, typename SrcDevice> + friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src); +}; + +/// a thread-safe view to a const bitset +/// i.e. can only test bits +template <typename Device> +class ConstBitset +{ +public: + typedef Device execution_space; + typedef unsigned size_type; + +private: + enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) }; + enum { block_mask = block_size -1u }; + enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) }; + +public: + ConstBitset() + : m_size (0) + {} + + ConstBitset(Bitset<Device> const& rhs) + : m_size(rhs.m_size) + , m_blocks(rhs.m_blocks) + {} + + ConstBitset(ConstBitset<Device> const& rhs) + : m_size( rhs.m_size ) + , m_blocks( rhs.m_blocks ) + {} + + ConstBitset<Device> & operator = (Bitset<Device> const & rhs) + { + this->m_size = rhs.m_size; + this->m_blocks = rhs.m_blocks; + + return *this; + } + + ConstBitset<Device> & operator = (ConstBitset<Device> const & rhs) + { + this->m_size = rhs.m_size; + this->m_blocks = rhs.m_blocks; + + return *this; + } + + + KOKKOS_FORCEINLINE_FUNCTION + unsigned size() const + { + return m_size; + } + + unsigned count() const + { + Impl::BitsetCount< ConstBitset<Device> > f(*this); + return f.apply(); + } + + KOKKOS_FORCEINLINE_FUNCTION 
+ bool test( unsigned i ) const + { + if ( i < m_size ) { + const unsigned block = m_blocks[ i >> block_shift ]; + const unsigned mask = 1u << static_cast<int>( i & block_mask ); + return block & mask; + } + return false; + } + +private: + + unsigned m_size; + View< const unsigned *, execution_space, MemoryTraits<RandomAccess> > m_blocks; + +private: + template <typename DDevice> + friend class ConstBitset; + + template <typename Bitset> + friend struct Impl::BitsetCount; + + template <typename DstDevice, typename SrcDevice> + friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src); + + template <typename DstDevice, typename SrcDevice> + friend void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src); +}; + + +template <typename DstDevice, typename SrcDevice> +void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src) +{ + if (dst.size() != src.size()) { + throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!"); + } + + typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy; + raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.dimension_0()); +} + +template <typename DstDevice, typename SrcDevice> +void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src) +{ + if (dst.size() != src.size()) { + throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!"); + } + + typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy; + raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.dimension_0()); +} + +template <typename DstDevice, typename SrcDevice> +void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src) +{ + if (dst.size() != src.size()) { + throw std::runtime_error("Error: Cannot deep_copy bitsets of 
different sizes!"); + } + + typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy; + raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.dimension_0()); +} + +} // namespace Kokkos + +#endif //KOKKOS_BITSET_HPP diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp new file mode 100755 index 0000000000000000000000000000000000000000..95eea57e9258cee18b4dbb0b9084d843739da88f --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -0,0 +1,840 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_DualView.hpp +/// \brief Declaration and definition of Kokkos::DualView. +/// +/// This header file declares and defines Kokkos::DualView and its +/// related nonmember functions. + +#ifndef KOKKOS_DUALVIEW_HPP +#define KOKKOS_DUALVIEW_HPP + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Error.hpp> + +namespace Kokkos { + +/* \class DualView + * \brief Container to manage mirroring a Kokkos::View that lives + * in device memory with a Kokkos::View that lives in host memory. + * + * This class provides capabilities to manage data which exists in two + * memory spaces at the same time. It keeps views of the same layout + * on two memory spaces as well as modified flags for both + * allocations. Users are responsible for setting the modified flags + * manually if they change the data in either memory space, by calling + * the sync() method templated on the device where they modified the + * data. Users may synchronize data by calling the modify() function, + * templated on the device towards which they want to synchronize + * (i.e., the target of the one-way copy operation). 
+ * + * The DualView class also provides convenience methods such as + * realloc, resize and capacity which call the appropriate methods of + * the underlying Kokkos::View objects. + * + * The four template arguments are the same as those of Kokkos::View. + * (Please refer to that class' documentation for a detailed + * description.) + * + * \tparam DataType The type of the entries stored in the container. + * + * \tparam Layout The array's layout in memory. + * + * \tparam Device The Kokkos Device type. If its memory space is + * not the same as the host's memory space, then DualView will + * contain two separate Views: one in device memory, and one in + * host memory. Otherwise, DualView will only store one View. + * + * \tparam MemoryTraits (optional) The user's intended memory access + * behavior. Please see the documentation of Kokkos::View for + * examples. The default suffices for most users. + */ +template< class DataType , + class Arg1Type = void , + class Arg2Type = void , + class Arg3Type = void> +class DualView : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > +{ +public: + //! \name Typedefs for device types and various Kokkos::View specializations. + //@{ + typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ; + + //! The Kokkos Host Device type; + typedef typename traits::host_mirror_space host_mirror_space ; + + //! The type of a Kokkos::View on the device. + typedef View< typename traits::data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > t_dev ; + + /// \typedef t_host + /// \brief The type of a Kokkos::View host mirror of \c t_dev. + typedef typename t_dev::HostMirror t_host ; + + //! The type of a const View on the device. + //! The type of a Kokkos::View on the device. 
+ typedef View< typename traits::const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > t_dev_const ; + + /// \typedef t_host_const + /// \brief The type of a const View host mirror of \c t_dev_const. + typedef typename t_dev_const::HostMirror t_host_const; + + //! The type of a const, random-access View on the device. + typedef View< typename traits::const_data_type , + typename traits::array_layout , + typename traits::device_type , + MemoryRandomAccess > t_dev_const_randomread ; + + /// \typedef t_host_const_randomread + /// \brief The type of a const, random-access View host mirror of + /// \c t_dev_const_randomread. + typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread; + + //! The type of an unmanaged View on the device. + typedef View< typename traits::data_type , + typename traits::array_layout , + typename traits::device_type , + MemoryUnmanaged> t_dev_um; + + //! The type of an unmanaged View host mirror of \c t_dev_um. + typedef View< typename t_host::data_type , + typename t_host::array_layout , + typename t_host::device_type , + MemoryUnmanaged> t_host_um; + + //! The type of a const unmanaged View on the device. + typedef View< typename traits::const_data_type , + typename traits::array_layout , + typename traits::device_type , + MemoryUnmanaged> t_dev_const_um; + + //! The type of a const unmanaged View host mirror of \c t_dev_const_um. + typedef View<typename t_host::const_data_type, + typename t_host::array_layout, + typename t_host::device_type, + MemoryUnmanaged> t_host_const_um; + + //@} + //! \name The two View instances. + //@{ + + t_dev d_view; + t_host h_view; + + //@} + //! \name Counters to keep track of changes ("modified" flags) + //@{ + + View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_device; + View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_host; + + //@} + //! 
\name Constructors + //@{ + + /// \brief Empty constructor. + /// + /// Both device and host View objects are constructed using their + /// default constructors. The "modified" flags are both initialized + /// to "unmodified." + DualView () : + modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")), + modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host")) + {} + + /// \brief Constructor that allocates View objects on both host and device. + /// + /// This constructor works like the analogous constructor of View. + /// The first argument is a string label, which is entirely for your + /// benefit. (Different DualView objects may have the same label if + /// you like.) The arguments that follow are the dimensions of the + /// View objects. For example, if the View has three dimensions, + /// the first three integer arguments will be nonzero, and you may + /// omit the integer arguments that follow. + DualView (const std::string& label, + const size_t n0 = 0, + const size_t n1 = 0, + const size_t n2 = 0, + const size_t n3 = 0, + const size_t n4 = 0, + const size_t n5 = 0, + const size_t n6 = 0, + const size_t n7 = 0) + : d_view (label, n0, n1, n2, n3, n4, n5, n6, n7) + , h_view (create_mirror_view (d_view)) // without UVM, host View mirrors + , modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")) + , modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host")) + {} + + //! Copy constructor (shallow copy) + template<class SS, class LS, class DS, class MS> + DualView (const DualView<SS,LS,DS,MS>& src) : + d_view (src.d_view), + h_view (src.h_view), + modified_device (src.modified_device), + modified_host (src.modified_host) + {} + + /// \brief Create DualView from existing device and host View objects. 
+ /// + /// This constructor assumes that the device and host View objects + /// are synchronized. You, the caller, are responsible for making + /// sure this is the case before calling this constructor. After + /// this constructor returns, you may use DualView's sync() and + /// modify() methods to ensure synchronization of the View objects. + /// + /// \param d_view_ Device View + /// \param h_view_ Host View (must have type t_host = t_dev::HostMirror) + DualView (const t_dev& d_view_, const t_host& h_view_) : + d_view (d_view_), + h_view (h_view_), + modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")), + modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host")) + { + Impl::assert_shapes_are_equal (d_view.shape (), h_view.shape ()); + } + + //@} + //! \name Methods for synchronizing, marking as modified, and getting Views. + //@{ + + /// \brief Return a View on a specific device \c Device. + /// + /// Please don't be afraid of the if_c expression in the return + /// value's type. That just tells the method what the return type + /// should be: t_dev if the \c Device template parameter matches + /// this DualView's device type, else t_host. 
+ /// + /// For example, suppose you create a DualView on Cuda, like this: + /// \code + /// typedef Kokkos::DualView<float, Kokkos::LayoutRight, Kokkos::Cuda> dual_view_type; + /// dual_view_type DV ("my dual view", 100); + /// \endcode + /// If you want to get the CUDA device View, do this: + /// \code + /// typename dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> (); + /// \endcode + /// and if you want to get the host mirror of that View, do this: + /// \code + /// typedef typename Kokkos::HostSpace::execution_space host_device_type; + /// typename dual_view_type::t_host hostView = DV.view<host_device_type> (); + /// \endcode + template< class Device > + KOKKOS_INLINE_FUNCTION + const typename Impl::if_c< + Impl::is_same<typename t_dev::memory_space, + typename Device::memory_space>::value, + t_dev, + t_host>::type& view () const + { + return Impl::if_c< + Impl::is_same< + typename t_dev::memory_space, + typename Device::memory_space>::value, + t_dev, + t_host >::select (d_view , h_view); + } + + /// \brief Update data on device or host only if data in the other + /// space has been marked as modified. + /// + /// If \c Device is the same as this DualView's device type, then + /// copy data from host to device. Otherwise, copy data from device + /// to host. In either case, only copy if the source of the copy + /// has been modified. + /// + /// This is a one-way synchronization only. If the target of the + /// copy has been modified, this operation will discard those + /// modifications. It will also reset both device and host modified + /// flags. + /// + /// \note This method doesn't know on its own whether you modified + /// the data in either View. You must manually mark modified data + /// as modified, by calling the modify() method with the + /// appropriate template parameter. 
+ template<class Device> + void sync( const typename Impl::enable_if< + ( Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value) || + ( Impl::is_same< Device , int>::value) + , int >::type& = 0) + { + const unsigned int dev = + Impl::if_c< + Impl::is_same< + typename t_dev::memory_space, + typename Device::memory_space>::value , + unsigned int, + unsigned int>::select (1, 0); + + if (dev) { // if Device is the same as DualView's device type + if ((modified_host () > 0) && (modified_host () >= modified_device ())) { + deep_copy (d_view, h_view); + modified_host() = modified_device() = 0; + } + } else { // hopefully Device is the same as DualView's host type + if ((modified_device () > 0) && (modified_device () >= modified_host ())) { + deep_copy (h_view, d_view); + modified_host() = modified_device() = 0; + } + } + if(Impl::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) { + t_dev::execution_space::fence(); + t_host::execution_space::fence(); + } + } + + template<class Device> + void sync ( const typename Impl::enable_if< + ( ! Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value ) || + ( Impl::is_same< Device , int>::value) + , int >::type& = 0 ) + { + const unsigned int dev = + Impl::if_c< + Impl::is_same< + typename t_dev::memory_space, + typename Device::memory_space>::value, + unsigned int, + unsigned int>::select (1, 0); + if (dev) { // if Device is the same as DualView's device type + if ((modified_host () > 0) && (modified_host () >= modified_device ())) { + Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype."); + } + } else { // hopefully Device is the same as DualView's host type + if ((modified_device () > 0) && (modified_device () >= modified_host ())) { + Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype."); + } + } + } + /// \brief Mark data as modified on the given device \c Device. 
+ /// + /// If \c Device is the same as this DualView's device type, then + /// mark the device's data as modified. Otherwise, mark the host's + /// data as modified. + template<class Device> + void modify () { + const unsigned int dev = + Impl::if_c< + Impl::is_same< + typename t_dev::memory_space, + typename Device::memory_space>::value, + unsigned int, + unsigned int>::select (1, 0); + + if (dev) { // if Device is the same as DualView's device type + // Increment the device's modified count. + modified_device () = (modified_device () > modified_host () ? + modified_device () : modified_host ()) + 1; + } else { // hopefully Device is the same as DualView's host type + // Increment the host's modified count. + modified_host () = (modified_device () > modified_host () ? + modified_device () : modified_host ()) + 1; + } + } + + //@} + //! \name Methods for reallocating or resizing the View objects. + //@{ + + /// \brief Reallocate both View objects. + /// + /// This discards any existing contents of the objects, and resets + /// their modified flags. It does <i>not</i> copy the old contents + /// of either View into the new View objects. + void realloc( const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) { + ::Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7); + h_view = create_mirror_view( d_view ); + + /* Reset dirty flags */ + modified_device() = modified_host() = 0; + } + + /// \brief Resize both views, copying old contents into new if necessary. + /// + /// This method only copies the old contents into the new View + /// objects for the device which was last marked as modified. 
+ void resize( const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) { + if(modified_device() >= modified_host()) { + /* Resize on Device */ + ::Kokkos::resize(d_view,n0,n1,n2,n3,n4,n5,n6,n7); + h_view = create_mirror_view( d_view ); + + /* Mark Device copy as modified */ + modified_device() = modified_device()+1; + + } else { + /* Realloc on Device */ + + ::Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7); + t_host temp_view = create_mirror_view( d_view ); + + /* Remap on Host */ + Kokkos::deep_copy( temp_view , h_view ); + + h_view = temp_view; + + /* Mark Host copy as modified */ + modified_host() = modified_host()+1; + } + } + + //@} + //! \name Methods for getting capacity, stride, or dimension(s). + //@{ + + //! The allocation size (same as Kokkos::View::capacity). + size_t capacity() const { +#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + return d_view.span(); +#else + return d_view.capacity(); +#endif + } + + //! Get stride(s) for each dimension. 
+ template< typename iType> + void stride(iType* stride_) const { + d_view.stride(stride_); + } + + /* \brief return size of dimension 0 */ + size_t dimension_0() const {return d_view.dimension_0();} + /* \brief return size of dimension 1 */ + size_t dimension_1() const {return d_view.dimension_1();} + /* \brief return size of dimension 2 */ + size_t dimension_2() const {return d_view.dimension_2();} + /* \brief return size of dimension 3 */ + size_t dimension_3() const {return d_view.dimension_3();} + /* \brief return size of dimension 4 */ + size_t dimension_4() const {return d_view.dimension_4();} + /* \brief return size of dimension 5 */ + size_t dimension_5() const {return d_view.dimension_5();} + /* \brief return size of dimension 6 */ + size_t dimension_6() const {return d_view.dimension_6();} + /* \brief return size of dimension 7 */ + size_t dimension_7() const {return d_view.dimension_7();} + + //@} +}; + +} // namespace Kokkos +// +// Partial specializations of Kokkos::subview() for DualView objects. +// + +namespace Kokkos { +namespace Impl { + +template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type + > +struct ViewSubview< DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type > + , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type + , SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type > +{ +private: + + typedef DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type > SrcViewType ; + + enum { V0 = Impl::is_same< SubArg0_type , void >::value ? 1 : 0 }; + enum { V1 = Impl::is_same< SubArg1_type , void >::value ? 1 : 0 }; + enum { V2 = Impl::is_same< SubArg2_type , void >::value ? 1 : 0 }; + enum { V3 = Impl::is_same< SubArg3_type , void >::value ? 1 : 0 }; + enum { V4 = Impl::is_same< SubArg4_type , void >::value ? 
1 : 0 }; + enum { V5 = Impl::is_same< SubArg5_type , void >::value ? 1 : 0 }; + enum { V6 = Impl::is_same< SubArg6_type , void >::value ? 1 : 0 }; + enum { V7 = Impl::is_same< SubArg7_type , void >::value ? 1 : 0 }; + + // The source view rank must be equal to the input argument rank + // Once a void argument is encountered all subsequent arguments must be void. + enum { InputRank = + Impl::StaticAssert<( SrcViewType::rank == + ( V0 ? 0 : ( + V1 ? 1 : ( + V2 ? 2 : ( + V3 ? 3 : ( + V4 ? 4 : ( + V5 ? 5 : ( + V6 ? 6 : ( + V7 ? 7 : 8 ))))))) )) + && + ( SrcViewType::rank == + ( 8 - ( V0 + V1 + V2 + V3 + V4 + V5 + V6 + V7 ) ) ) + >::value ? SrcViewType::rank : 0 }; + + enum { R0 = Impl::ViewOffsetRange< SubArg0_type >::is_range ? 1 : 0 }; + enum { R1 = Impl::ViewOffsetRange< SubArg1_type >::is_range ? 1 : 0 }; + enum { R2 = Impl::ViewOffsetRange< SubArg2_type >::is_range ? 1 : 0 }; + enum { R3 = Impl::ViewOffsetRange< SubArg3_type >::is_range ? 1 : 0 }; + enum { R4 = Impl::ViewOffsetRange< SubArg4_type >::is_range ? 1 : 0 }; + enum { R5 = Impl::ViewOffsetRange< SubArg5_type >::is_range ? 1 : 0 }; + enum { R6 = Impl::ViewOffsetRange< SubArg6_type >::is_range ? 1 : 0 }; + enum { R7 = Impl::ViewOffsetRange< SubArg7_type >::is_range ? 1 : 0 }; + + enum { OutputRank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) }; + + // Reverse + enum { R0_rev = 0 == InputRank ? 0u : ( + 1 == InputRank ? unsigned(R0) : ( + 2 == InputRank ? unsigned(R1) : ( + 3 == InputRank ? unsigned(R2) : ( + 4 == InputRank ? unsigned(R3) : ( + 5 == InputRank ? unsigned(R4) : ( + 6 == InputRank ? unsigned(R5) : ( + 7 == InputRank ? unsigned(R6) : unsigned(R7) ))))))) }; + + typedef typename SrcViewType::array_layout SrcViewLayout ; + + // Choose array layout, attempting to preserve original layout if at all possible. 
+ typedef typename Impl::if_c<
+ ( // Same Layout IF
+ // OutputRank 0
+ ( OutputRank == 0 )
+ ||
+ // OutputRank 1 or 2, InputLayout Left, Interval 0
+ // because single stride one or second index has a stride.
+ ( OutputRank <= 2 && R0 && Impl::is_same<SrcViewLayout,LayoutLeft>::value )
+ ||
+ // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
+ // because single stride one or second index has a stride.
+ ( OutputRank <= 2 && R0_rev && Impl::is_same<SrcViewLayout,LayoutRight>::value )
+ ), SrcViewLayout , Kokkos::LayoutStride >::type OutputViewLayout ;
+
+ // Choose data type as a purely dynamic rank array to accommodate a runtime range.
+ typedef typename Impl::if_c< OutputRank == 0 , typename SrcViewType::value_type ,
+ typename Impl::if_c< OutputRank == 1 , typename SrcViewType::value_type *,
+ typename Impl::if_c< OutputRank == 2 , typename SrcViewType::value_type **,
+ typename Impl::if_c< OutputRank == 3 , typename SrcViewType::value_type ***,
+ typename Impl::if_c< OutputRank == 4 , typename SrcViewType::value_type ****,
+ typename Impl::if_c< OutputRank == 5 , typename SrcViewType::value_type *****,
+ typename Impl::if_c< OutputRank == 6 , typename SrcViewType::value_type ******,
+ typename Impl::if_c< OutputRank == 7 , typename SrcViewType::value_type *******,
+ typename SrcViewType::value_type ********
+ >::type >::type >::type >::type >::type >::type >::type >::type OutputData ;
+
+ // Choose space.
+ // If the source view's template arg1 or arg2 is a space then use it,
+ // otherwise use the source view's execution space.
+
+ typedef typename Impl::if_c< Impl::is_space< SrcArg1Type >::value , SrcArg1Type ,
+ typename Impl::if_c< Impl::is_space< SrcArg2Type >::value , SrcArg2Type , typename SrcViewType::execution_space
+ >::type >::type OutputSpace ;
+
+public:
+
+ // If keeping the layout then match non-data type arguments
+ // else keep execution space and memory traits.
+ typedef typename + Impl::if_c< Impl::is_same< SrcViewLayout , OutputViewLayout >::value + , Kokkos::DualView< OutputData , SrcArg1Type , SrcArg2Type , SrcArg3Type > + , Kokkos::DualView< OutputData , OutputViewLayout , OutputSpace + , typename SrcViewType::memory_traits > + >::type type ; +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +namespace Kokkos { + +template< class D , class A1 , class A2 , class A3 , + class ArgType0 > +typename Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , void , void , void + , void , void , void , void + >::type +subview( const DualView<D,A1,A2,A3> & src , + const ArgType0 & arg0 ) +{ + typedef typename + Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , void , void , void + , void , void , void , void + >::type + DstViewType ; + DstViewType sub_view; + sub_view.d_view = subview(src.d_view,arg0); + sub_view.h_view = subview(src.h_view,arg0); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + + +template< class D , class A1 , class A2 , class A3 , + class ArgType0 , class ArgType1 > +typename Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , void , void + , void , void , void , void + >::type +subview( const DualView<D,A1,A2,A3> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 ) +{ + typedef typename + Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , void , void + , void , void , void , void + >::type + DstViewType ; + DstViewType sub_view; + sub_view.d_view = subview(src.d_view,arg0,arg1); + sub_view.h_view = subview(src.h_view,arg0,arg1); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class D , class A1 , class A2 , class A3 , + class ArgType0 , class ArgType1 , class ArgType2 > +typename Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , void + , void , void , void , void + >::type +subview( 
const DualView<D,A1,A2,A3> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 ) +{ + typedef typename + Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , void + , void , void , void , void + >::type + DstViewType ; + DstViewType sub_view; + sub_view.d_view = subview(src.d_view,arg0,arg1,arg2); + sub_view.h_view = subview(src.h_view,arg0,arg1,arg2); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class D , class A1 , class A2 , class A3 , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 > +typename Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , ArgType3 + , void , void , void , void + >::type +subview( const DualView<D,A1,A2,A3> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 ) +{ + typedef typename + Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , ArgType3 + , void , void , void , void + >::type + DstViewType ; + DstViewType sub_view; + sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3); + sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class D , class A1 , class A2 , class A3 , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 > +typename Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , ArgType3 + , ArgType4 , void , void , void + >::type +subview( const DualView<D,A1,A2,A3> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 ) +{ + typedef typename + Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , ArgType3 + , ArgType4 , void , void ,void + >::type + DstViewType ; 
+ DstViewType sub_view; + sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4); + sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class D , class A1 , class A2 , class A3 , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 , class ArgType5 > +typename Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , ArgType3 + , ArgType4 , ArgType5 , void , void + >::type +subview( const DualView<D,A1,A2,A3> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 , + const ArgType5 & arg5 ) +{ + typedef typename + Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , ArgType3 + , ArgType4 , ArgType5 , void , void + >::type + DstViewType ; + DstViewType sub_view; + sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5); + sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class D , class A1 , class A2 , class A3 , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 , class ArgType5 , class ArgType6 > +typename Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , ArgType3 + , ArgType4 , ArgType5 , ArgType6 , void + >::type +subview( const DualView<D,A1,A2,A3> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 , + const ArgType5 & arg5 , + const ArgType6 & arg6 ) +{ + typedef typename + Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , ArgType3 + , ArgType4 , ArgType5 , ArgType6 , void + >::type + DstViewType ; + DstViewType sub_view; + 
sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6); + sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class D , class A1 , class A2 , class A3 , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 > +typename Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , ArgType3 + , ArgType4 , ArgType5 , ArgType6 , ArgType7 + >::type +subview( const DualView<D,A1,A2,A3> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 , + const ArgType5 & arg5 , + const ArgType6 & arg6 , + const ArgType7 & arg7 ) +{ + typedef typename + Impl::ViewSubview< DualView<D,A1,A2,A3> + , ArgType0 , ArgType1 , ArgType2 , ArgType3 + , ArgType4 , ArgType5 , ArgType6 , ArgType7 + >::type + DstViewType ; + DstViewType sub_view; + sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7); + sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +// +// Partial specialization of Kokkos::deep_copy() for DualView objects. 
+// + +template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > +void +deep_copy (DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference + const DualView<ST,SL,SD,SM>& src ) +{ + if (src.modified_device () >= src.modified_host ()) { + deep_copy (dst.d_view, src.d_view); + dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> (); + } else { + deep_copy (dst.h_view, src.h_view); + dst.template modify<typename DualView<DT,DL,DD,DM>::host_mirror_space> (); + } +} + +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/containers/src/Kokkos_Functional.hpp b/lib/kokkos/containers/src/Kokkos_Functional.hpp new file mode 100755 index 0000000000000000000000000000000000000000..5c7350ef1cd3bb1ed68deff0c823ce3f7a5a3619 --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_Functional.hpp @@ -0,0 +1,173 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_FUNCTIONAL_HPP +#define KOKKOS_FUNCTIONAL_HPP + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_Functional_impl.hpp> + +namespace Kokkos { + +// These should work for most types + +template <typename T> +struct pod_hash +{ + typedef T argument_type; + typedef T first_argument_type; + typedef uint32_t second_argument_type; + typedef uint32_t result_type; + + KOKKOS_FORCEINLINE_FUNCTION + uint32_t operator()(T const & t) const + { return Impl::MurmurHash3_x86_32( &t, sizeof(T), 0); } + + KOKKOS_FORCEINLINE_FUNCTION + uint32_t operator()(T const & t, uint32_t seed) const + { return Impl::MurmurHash3_x86_32( &t, sizeof(T), seed); } +}; + +template <typename T> +struct pod_equal_to +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return Impl::bitwise_equal(&a,&b); } +}; + +template <typename T> +struct pod_not_equal_to +{ + typedef T first_argument_type; + typedef T second_argument_type; + 
typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return !Impl::bitwise_equal(&a,&b); } +}; + +template <typename T> +struct equal_to +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a == b; } +}; + +template <typename T> +struct not_equal_to +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a != b; } +}; + + +template <typename T> +struct greater +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a > b; } +}; + + +template <typename T> +struct less +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a < b; } +}; + +template <typename T> +struct greater_equal +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a >= b; } +}; + + +template <typename T> +struct less_equal +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a <= b; } +}; + +} // namespace Kokkos + + +#endif //KOKKOS_FUNCTIONAL_HPP + + diff --git a/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp b/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp new file mode 100755 index 0000000000000000000000000000000000000000..3f328ba9563f01421c93dda8e8eeafbc2d679968 --- /dev/null +++ 
b/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp @@ -0,0 +1,531 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_SEGMENTED_VIEW_HPP_
#define KOKKOS_SEGMENTED_VIEW_HPP_

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <cstdio>

#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )

namespace Kokkos {
namespace Experimental {

namespace Impl {

// Forward declaration: functor that deletes the per-segment allocations
// when the last SegmentedView reference goes away (defined at end of file).
template<class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
struct delete_segmented_view;

// Hook to enlarge the device-side allocatable heap before in-kernel `new`
// is used to allocate segments.  No-op for memory spaces without such a limit.
template<class MemorySpace>
inline
void DeviceSetAllocatableMemorySize(size_t) {}

#if defined( KOKKOS_HAVE_CUDA )

// CUDA device memory: raise the device malloc heap limit so in-kernel `new`
// of segments can succeed.  Doubles the request to leave headroom; the final
// cudaDeviceGetLimit re-reads the (possibly rounded) value the driver set.
template<>
inline
void DeviceSetAllocatableMemorySize<Kokkos::CudaSpace>(size_t size) {
#ifdef __CUDACC__
  size_t size_limit;
  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
  if(size_limit<size)
    cudaDeviceSetLimit(cudaLimitMallocHeapSize,2*size);
  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
#endif
}

// Same treatment for CUDA UVM allocations.
template<>
inline
void DeviceSetAllocatableMemorySize<Kokkos::CudaUVMSpace>(size_t size) {
#ifdef __CUDACC__
  size_t size_limit;
  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
  if(size_limit<size)
    cudaDeviceSetLimit(cudaLimitMallocHeapSize,2*size);
  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
#endif
}

#endif /* #if defined( KOKKOS_HAVE_CUDA ) */

}

// A view whose first dimension grows at runtime in fixed, power-of-two sized
// segments.  Growth allocates new segments and never reallocates or copies
// existing ones, so element references remain valid across grow() calls.
template< class DataType ,
          class Arg1Type = void ,
          class Arg2Type = void ,
          class Arg3Type = void>
class SegmentedView : public Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
{
public:
  //! \name Typedefs for device types and various Kokkos::View specializations.
  //@{
  typedef Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;

  //! The type of a Kokkos::View on the device.
  typedef Kokkos::View< typename traits::data_type ,
                        typename traits::array_layout ,
                        typename traits::memory_space ,
                        Kokkos::MemoryUnmanaged > t_dev ;


private:
  // One unmanaged view per allocated segment.
  Kokkos::View<t_dev*,typename traits::memory_space> segments_;

  // 0/1 spin lock guarding segment allocation in grow().
  Kokkos::View<int,typename traits::memory_space> realloc_lock;
  // Number of segments currently allocated (scalar living in the view's memory space).
  Kokkos::View<int,typename traits::memory_space> nsegments_;

  size_t segment_length_;     // power-of-two extent of each segment in dimension 0
  size_t segment_length_m1_;  // segment_length_ - 1: mask for the intra-segment index
  int max_segments_;          // capacity, in segments

  int segment_length_log2;    // shift used to compute the segment index from i0

  // Dimensions, cardinality, capacity, and offset computation for
  // multidimensional array view of contiguous memory.
  // Inherits from Impl::Shape
  typedef Kokkos::Impl::ViewOffset< typename traits::shape_type
                                  , typename traits::array_layout
                                  > offset_map_type ;

  offset_map_type m_offset_map ;

  typedef Kokkos::View< typename traits::array_intrinsic_type ,
                        typename traits::array_layout ,
                        typename traits::memory_space ,
                        typename traits::memory_traits > array_type ;

  typedef Kokkos::View< typename traits::const_data_type ,
                        typename traits::array_layout ,
                        typename traits::memory_space ,
                        typename traits::memory_traits > const_type ;

  typedef Kokkos::View< typename traits::non_const_data_type ,
                        typename traits::array_layout ,
                        typename traits::memory_space ,
                        typename traits::memory_traits > non_const_type ;

  typedef Kokkos::View< typename traits::non_const_data_type ,
                        typename traits::array_layout ,
                        HostSpace ,
                        void > HostMirror ;

  // Fast path: the calling execution space can read nsegments_ directly.
  template< bool Accessible >
  KOKKOS_INLINE_FUNCTION
  typename Kokkos::Impl::enable_if< Accessible , typename traits::size_type >::type
  dimension_0_intern() const { return nsegments_() * segment_length_ ; }

  // Slow path: nsegments_ lives in a memory space the caller cannot read;
  // deep-copy the scalar back to the host first.
  template< bool Accessible >
  KOKKOS_INLINE_FUNCTION
  typename Kokkos::Impl::enable_if< ! Accessible , typename traits::size_type >::type
  dimension_0_intern() const
  {
    // In Host space
    int n = 0 ;
#if ! defined( __CUDA_ARCH__ )
    Kokkos::Impl::DeepCopy< HostSpace , typename traits::memory_space >( & n , nsegments_.ptr_on_device() , sizeof(int) );
#endif

    return n * segment_length_ ;
  }

public:

  enum { Rank = traits::rank };

  KOKKOS_INLINE_FUNCTION offset_map_type shape() const { return m_offset_map ; }

  /* \brief return (current) size of dimension 0 */
  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const {
    enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
             Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value };
    int n = SegmentedView::dimension_0_intern< Accessible >();
    return n ;
  }

  /* \brief return size of dimension 1 */
  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; }
  /* \brief return size of dimension 2 */
  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; }
  /* \brief return size of dimension 3 */
  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; }
  /* \brief return size of dimension 4 */
  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; }
  /* \brief return size of dimension 5 */
  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; }
  /* \brief return size of dimension 6 */
  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; }
  /* \brief return size of dimension 7 */
  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; }

  /* \brief return total number of elements: current dimension 0 times the
   *        fixed dimensions 1-7 (original comment said "dimension 2" — fixed) */
  KOKKOS_INLINE_FUNCTION typename traits::size_type size() const {
    return dimension_0() *
           m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 *
           m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7 ;
  }

  /* \brief return size of dimension i; dimension 0 requires the dynamic path */
  template< typename iType >
  KOKKOS_INLINE_FUNCTION
  typename traits::size_type dimension( const iType & i ) const {
    if(i==0)
      return dimension_0();
    else
      return Kokkos::Impl::dimension( m_offset_map , i );
  }

  // Total elements if every segment were allocated (upper bound on size()).
  KOKKOS_INLINE_FUNCTION
  typename traits::size_type capacity() {
    return segments_.dimension_0() *
           m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 *
           m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7;
  }

  // Number of segments currently allocated.
  KOKKOS_INLINE_FUNCTION
  typename traits::size_type get_num_segments() {
    enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
             Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value };
    int n = SegmentedView::dimension_0_intern< Accessible >();
    return n/segment_length_ ;
  }

  // Maximum number of segments that can ever be allocated.
  KOKKOS_INLINE_FUNCTION
  typename traits::size_type get_max_segments() {
    return max_segments_;
  }

  /// \brief Constructor that allocates View objects with an initial length of 0.
  ///
  /// This constructor works mostly like the analogous constructor of View.
  /// The first argument is a string label, which is entirely for your
  /// benefit. (Different SegmentedView objects may have the same label if
  /// you like.) The second argument 'view_length' is the size of the segments.
  /// This number must be a power of two. The third argument n0 is the maximum
  /// value for the first dimension of the segmented view. The maximal allocatable
  /// number of Segments is thus: (n0+view_length-1)/view_length.
  /// The arguments that follow are the other dimensions of the (1-7) of the
  /// View objects. For example, for a View with 3 runtime dimensions,
  /// the first 4 integer arguments will be nonzero:
  /// SegmentedView("Name",32768,10000000,8,4). This allocates a SegmentedView
  /// with a maximum of 306 segments of dimension (32768,8,4). The logical size of
  /// the segmented view is (n,8,4) with n between 0 and 10000000.
  /// You may omit the integer arguments that follow.
  template< class LabelType >
  SegmentedView(const LabelType & label ,
                const size_t view_length ,
                const size_t n0 ,
                const size_t n1 = 0 ,
                const size_t n2 = 0 ,
                const size_t n3 = 0 ,
                const size_t n4 = 0 ,
                const size_t n5 = 0 ,
                const size_t n6 = 0 ,
                const size_t n7 = 0
                ): segment_length_(view_length),segment_length_m1_(view_length-1)
  {
    // Compute log2(view_length) and verify it is an exact power of two.
    segment_length_log2 = -1;
    size_t l = segment_length_;
    while(l>0) {
      l>>=1;
      segment_length_log2++;
    }
    l = 1<<segment_length_log2;
    if(l!=segment_length_)
      Kokkos::Impl::throw_runtime_exception("Kokkos::SegmentedView requires a 'power of 2' segment length");

    max_segments_ = (n0+segment_length_m1_)/segment_length_;

    // Ensure in-kernel `new` of all segments can succeed (e.g. CUDA heap limit).
    Impl::DeviceSetAllocatableMemorySize<typename traits::memory_space>(segment_length_*max_segments_*sizeof(typename traits::value_type));

    segments_ = Kokkos::View<t_dev*,typename traits::execution_space>(label , max_segments_);
    realloc_lock = Kokkos::View<int,typename traits::execution_space>("Lock");
    nsegments_ = Kokkos::View<int,typename traits::execution_space>("nviews");
    m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n0*n1*n2*n3*n4*n5*n6*n7 );

  }

  // Shallow copy: all copies share the segments, lock, and segment count.
  KOKKOS_INLINE_FUNCTION
  SegmentedView(const SegmentedView& src):
    segments_(src.segments_),
    realloc_lock (src.realloc_lock),
    nsegments_ (src.nsegments_),
    segment_length_(src.segment_length_),
    segment_length_m1_(src.segment_length_m1_),
    max_segments_ (src.max_segments_),
    segment_length_log2(src.segment_length_log2),
    m_offset_map (src.m_offset_map)
  {}

  // Shallow assignment, mirroring the copy constructor.
  KOKKOS_INLINE_FUNCTION
  SegmentedView& operator= (const SegmentedView& src) {
    segments_ = src.segments_;
    realloc_lock = src.realloc_lock;
    nsegments_ = src.nsegments_;
    segment_length_= src.segment_length_;
    segment_length_m1_= src.segment_length_m1_;
    max_segments_ = src.max_segments_;
    segment_length_log2= src.segment_length_log2;
    m_offset_map = src.m_offset_map;
    return *this;
  }

  // When the last reference-counted copy is destroyed, delete every segment's
  // allocation via a parallel_for (the pointers were allocated with in-kernel
  // `new`, so deletion runs in the same execution space).
  ~SegmentedView() {
    if ( !segments_.tracker().ref_counting()) { return; }
    size_t ref_count = segments_.tracker().ref_count();
    if(ref_count == 1u) {
      Kokkos::fence();
      typename Kokkos::View<int,typename traits::execution_space>::HostMirror h_nviews("h_nviews");
      Kokkos::deep_copy(h_nviews,nsegments_);
      Kokkos::parallel_for(h_nviews(),Impl::delete_segmented_view<DataType , Arg1Type , Arg2Type, Arg3Type>(*this));
    }
  }

  // Return the i-th segment (an unmanaged view over that segment's storage).
  KOKKOS_INLINE_FUNCTION
  t_dev get_segment(const int& i) const {
    return segments_[i];
  }

  // Grow the view so that dimension 0 holds at least growSize elements.
  // Team-collective: only team rank 0 allocates, serialized across teams by
  // the realloc_lock spin lock; all ranks meet at the trailing barrier.
  template< class MemberType>
  KOKKOS_INLINE_FUNCTION
  void grow (MemberType& team_member, const size_t& growSize) const {
    if (growSize>max_segments_*segment_length_) {
      // NOTE(review): %lu with size_t is not portable on LLP64 targets — verify %zu is acceptable here.
      printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_);
      return;
    }

    if(team_member.team_rank()==0) {
      bool too_small = growSize > segment_length_ * nsegments_();
      if (too_small) {
        while(Kokkos::atomic_compare_exchange(&realloc_lock(),0,1) )
          ; // get the lock
        too_small = growSize > segment_length_ * nsegments_(); // Recheck once we have the lock
        if(too_small) {
          while(too_small) {
            // Bytes per segment = segment length times the fixed dimensions.
            const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3*
                                      m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7;
            typename traits::non_const_value_type* const ptr = new typename traits::non_const_value_type[alloc_size];

            segments_(nsegments_()) =
              t_dev(ptr,segment_length_,m_offset_map.N1,m_offset_map.N2,m_offset_map.N3,m_offset_map.N4,m_offset_map.N5,m_offset_map.N6,m_offset_map.N7);
            nsegments_()++;
            too_small = growSize > segment_length_ * nsegments_();
          }
        }
        realloc_lock() = 0; //release the lock
      }
    }
    team_member.team_barrier();
  }

  // Same as grow() but without locking or a barrier; only safe when no other
  // thread can concurrently grow or read the segment count.
  KOKKOS_INLINE_FUNCTION
  void grow_non_thread_safe (const size_t& growSize) const {
    if (growSize>max_segments_*segment_length_) {
      // NOTE(review): same %lu/size_t portability caveat as in grow().
      printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_);
      return;
    }
    bool too_small = growSize > segment_length_ * nsegments_();
    if(too_small) {
      while(too_small) {
        const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3*
                                  m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7;
        typename traits::non_const_value_type* const ptr =
          new typename traits::non_const_value_type[alloc_size];

        segments_(nsegments_()) =
          t_dev (ptr, segment_length_, m_offset_map.N1, m_offset_map.N2,
                 m_offset_map.N3, m_offset_map.N4, m_offset_map.N5,
                 m_offset_map.N6, m_offset_map.N7);
        nsegments_()++;
        too_small = growSize > segment_length_ * nsegments_();
      }
    }
  }

  // Element access for ranks 1-8.  The segment is selected by the high bits
  // of i0 (i0 >> log2(segment_length)); the position within the segment is
  // the low bits (i0 & (segment_length - 1)).
  template< typename iType0 >
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<iType0>::value && traits::rank == 1 )
                         , typename traits::value_type &
                         >::type
  operator() ( const iType0 & i0 ) const
  {
    return segments_[i0>>segment_length_log2](i0&(segment_length_m1_));
  }

  template< typename iType0 , typename iType1 >
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<iType0>::value &&
                            std::is_integral<iType1>::value &&
                            traits::rank == 2 )
                         , typename traits::value_type &
                         >::type
  operator() ( const iType0 & i0 , const iType1 & i1 ) const
  {
    return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1);
  }

  template< typename iType0 , typename iType1 , typename iType2 >
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<iType0>::value &&
                            std::is_integral<iType1>::value &&
                            std::is_integral<iType2>::value &&
                            traits::rank == 3 )
                         , typename traits::value_type &
                         >::type
  operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
  {
    return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2);
  }

  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<iType0>::value &&
                            std::is_integral<iType1>::value &&
                            std::is_integral<iType2>::value &&
                            std::is_integral<iType3>::value &&
                            traits::rank == 4 )
                         , typename traits::value_type &
                         >::type
  operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
  {
    return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3);
  }

  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
            typename iType4 >
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<iType0>::value &&
                            std::is_integral<iType1>::value &&
                            std::is_integral<iType2>::value &&
                            std::is_integral<iType3>::value &&
                            std::is_integral<iType4>::value &&
                            traits::rank == 5 )
                         , typename traits::value_type &
                         >::type
  operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
               const iType4 & i4 ) const
  {
    return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4);
  }

  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
            typename iType4 , typename iType5 >
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<iType0>::value &&
                            std::is_integral<iType1>::value &&
                            std::is_integral<iType2>::value &&
                            std::is_integral<iType3>::value &&
                            std::is_integral<iType4>::value &&
                            std::is_integral<iType5>::value &&
                            traits::rank == 6 )
                         , typename traits::value_type &
                         >::type
  operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
               const iType4 & i4 , const iType5 & i5 ) const
  {
    return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5);
  }

  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
            typename iType4 , typename iType5 , typename iType6 >
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<iType0>::value &&
                            std::is_integral<iType1>::value &&
                            std::is_integral<iType2>::value &&
                            std::is_integral<iType3>::value &&
                            std::is_integral<iType4>::value &&
                            std::is_integral<iType5>::value &&
                            std::is_integral<iType6>::value &&
                            traits::rank == 7 )
                         , typename traits::value_type &
                         >::type
  operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
               const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
  {
    return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6);
  }

  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
            typename iType4 , typename iType5 , typename iType6 , typename iType7 >
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<iType0>::value &&
                            std::is_integral<iType1>::value &&
                            std::is_integral<iType2>::value &&
                            std::is_integral<iType3>::value &&
                            std::is_integral<iType4>::value &&
                            std::is_integral<iType5>::value &&
                            std::is_integral<iType6>::value &&
                            std::is_integral<iType7>::value &&
                            traits::rank == 8 )
                         , typename traits::value_type &
                         >::type
  operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
               const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const
  {
    return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6,i7);
  }
};

namespace Impl {
// Functor run by ~SegmentedView to delete each segment's allocation in the
// view's execution space (one index per segment).
template<class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
struct delete_segmented_view {
  typedef SegmentedView<DataType , Arg1Type , Arg2Type, Arg3Type> view_type;
  typedef typename view_type::execution_space execution_space;

  view_type view_;
  delete_segmented_view(view_type view):view_(view) {
  }

  KOKKOS_INLINE_FUNCTION
  void operator() (int i) const {
    delete [] view_.get_segment(i).ptr_on_device();
  }
};

}
}
}

#endif

#endif
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 2.0
//              Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_STATICCRSGRAPH_HPP
#define KOKKOS_STATICCRSGRAPH_HPP

#include <string>
#include <vector>

#include <Kokkos_Core.hpp>

namespace Kokkos {

/// \class StaticCrsGraph
/// \brief Compressed row storage array.
///
/// \tparam DataType The type of stored entries.  If a StaticCrsGraph is
///   used as the graph of a sparse matrix, then this is usually an
///   integer type, the type of the column indices in the sparse
///   matrix.
///
/// \tparam Arg1Type The second template parameter, corresponding
///   either to the Device type (if there are no more template
///   parameters) or to the Layout type (if there is at least one more
///   template parameter).
///
/// \tparam Arg2Type The third template parameter, which if provided
///   corresponds to the Device type.
///
/// \tparam SizeType The type of row offsets.  Usually the default
///   parameter suffices.  However, setting a nondefault value is
///   necessary in some cases, for example, if you want to have a
///   sparse matrices with dimensions (and therefore column indices)
///   that fit in \c int, but want to store more than <tt>INT_MAX</tt>
///   entries in the sparse matrix.
///
/// A row has a range of entries:
/// <ul>
/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li>
/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
/// </ul>
template< class DataType,
          class Arg1Type,
          class Arg2Type = void,
          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
class StaticCrsGraph {
private:
  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;

public:
  typedef DataType data_type;
  typedef typename traits::array_layout array_layout;
  typedef typename traits::execution_space execution_space;
  typedef typename traits::device_type device_type;
  typedef SizeType size_type;

  typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
  typedef StaticCrsGraph< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
  // row_map has numRows()+1 offsets; entries holds the packed column data.
  typedef View< const size_type* , array_layout, device_type > row_map_type;
  typedef View<       DataType*  , array_layout, device_type > entries_type;

  entries_type entries;
  row_map_type row_map;

  //! Construct an empty view.
  StaticCrsGraph () : entries(), row_map() {}

  //! Copy constructor (shallow copy).
  StaticCrsGraph (const StaticCrsGraph& rhs) : entries (rhs.entries), row_map (rhs.row_map)
  {}

  //! Construct from existing entries and row-map views (shallow copy).
  template<class EntriesType, class RowMapType>
  StaticCrsGraph (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
  {}

  /** \brief  Assign to a view of the rhs array.
   *          If the old view is the last view
   *          then allocated memory is deallocated.
   */
  StaticCrsGraph& operator= (const StaticCrsGraph& rhs) {
    entries = rhs.entries;
    row_map = rhs.row_map;
    return *this;
  }

  /**  \brief  Destroy this view of the array.
   *           If the last view then allocated memory is deallocated.
   */
  ~StaticCrsGraph() {}

  // Number of rows: row_map stores numRows()+1 offsets, or 0 when unallocated.
  KOKKOS_INLINE_FUNCTION
  size_type numRows() const {
    return (row_map.dimension_0 () != 0) ?
      row_map.dimension_0 () - static_cast<size_type> (1) :
      static_cast<size_type> (0);
  }
};

//----------------------------------------------------------------------------

// Build a StaticCrsGraph from a vector of row lengths (entry values default).
template< class StaticCrsGraphType , class InputSizeType >
typename StaticCrsGraphType::staticcrsgraph_type
create_staticcrsgraph( const std::string & label ,
                       const std::vector< InputSizeType > & input );

// Build a StaticCrsGraph from a vector-of-vectors adjacency description.
template< class StaticCrsGraphType , class InputSizeType >
typename StaticCrsGraphType::staticcrsgraph_type
create_staticcrsgraph( const std::string & label ,
                       const std::vector< std::vector< InputSizeType > > & input );

//----------------------------------------------------------------------------

template< class DataType ,
          class Arg1Type ,
          class Arg2Type ,
          typename SizeType >
typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input );

template< class DataType ,
          class Arg1Type ,
          class Arg2Type ,
          typename SizeType >
typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input );

} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#include <impl/Kokkos_StaticCrsGraph_factory.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// Max-reduction functor over a graph's entries (used by maximum_entry below).
template< class GraphType >
struct StaticCrsGraphMaximumEntry {

  typedef typename GraphType::execution_space execution_space ;
  typedef typename GraphType::data_type value_type ;

  const typename GraphType::entries_type entries ;

  StaticCrsGraphMaximumEntry( const GraphType & graph ) : entries( graph.entries ) {}

  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned i , value_type & update ) const
    { if ( update < entries(i) ) update = entries(i); }

  KOKKOS_INLINE_FUNCTION
  void init( value_type & update ) const
    { update = 0 ; }

  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type & update ,
             volatile const value_type & input ) const
    { if ( update < input ) update = input ; }
};

}

// Return the maximum entry value in the graph (0 for an empty graph,
// since the reduction identity is 0).
template< class DataType, class Arg1Type, class Arg2Type, typename SizeType >
DataType maximum_entry( const StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > & graph )
{
  typedef StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType> GraphType ;
  typedef Impl::StaticCrsGraphMaximumEntry< GraphType > FunctorType ;

  DataType result = 0 ;
  Kokkos::parallel_reduce( graph.entries.dimension_0(),
                           FunctorType(graph), result );
  return result ;
}

} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #ifndef KOKKOS_CRSARRAY_HPP */

/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 2.0
//              Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1.
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_UnorderedMap.hpp +/// \brief Declaration and definition of Kokkos::UnorderedMap. +/// +/// This header file declares and defines Kokkos::UnorderedMap and its +/// related nonmember functions. 
+ +#ifndef KOKKOS_UNORDERED_MAP_HPP +#define KOKKOS_UNORDERED_MAP_HPP + +#include <Kokkos_Core.hpp> +#include <Kokkos_Functional.hpp> + +#include <Kokkos_Bitset.hpp> + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_UnorderedMap_impl.hpp> + + +#include <iostream> + +#include <stdint.h> +#include <stdexcept> + + +namespace Kokkos { + +enum { UnorderedMapInvalidIndex = ~0u }; + +/// \brief First element of the return value of UnorderedMap::insert(). +/// +/// Inserting an element into an UnorderedMap is not guaranteed to +/// succeed. There are three possible conditions: +/// <ol> +/// <li> <tt>INSERT_FAILED</tt>: The insert failed. This usually +/// means that the UnorderedMap ran out of space. </li> +/// <li> <tt>INSERT_SUCCESS</tt>: The insert succeeded, and the key +/// did <i>not</i> exist in the table before. </li> +/// <li> <tt>INSERT_EXISTING</tt>: The insert succeeded, and the key +/// <i>did</i> exist in the table before. The new value was +/// ignored and the old value was left in place. 
</li>
/// </ol>

// Result descriptor returned by UnorderedMap::insert(): packs the index of
// the key together with status flags and the probe-loop iteration count.
class UnorderedMapInsertResult
{
private:
  // m_status bit layout: the top three bits are flags; the remaining low
  // 29 bits (LIST_LENGTH_MASK) count insert-loop iterations.
  enum Status{
      SUCCESS = 1u << 31
    , EXISTING = 1u << 30
    , FREED_EXISTING = 1u << 29
    , LIST_LENGTH_MASK = ~(SUCCESS | EXISTING | FREED_EXISTING)
  };

public:
  /// Did the map successfully insert the key/value pair
  KOKKOS_FORCEINLINE_FUNCTION
  bool success() const { return (m_status & SUCCESS); }

  /// Was the key already present in the map
  KOKKOS_FORCEINLINE_FUNCTION
  bool existing() const { return (m_status & EXISTING); }

  /// Did the map fail to insert the key due to insufficient capacity
  KOKKOS_FORCEINLINE_FUNCTION
  bool failed() const { return m_index == UnorderedMapInvalidIndex; }

  /// Did the map lose a race condition to insert a duplicate key/value pair
  /// where an index was claimed that needed to be released
  KOKKOS_FORCEINLINE_FUNCTION
  bool freed_existing() const { return (m_status & FREED_EXISTING); }

  /// How many iterations through the insert loop did it take before the
  /// map returned
  KOKKOS_FORCEINLINE_FUNCTION
  uint32_t list_position() const { return (m_status & LIST_LENGTH_MASK); }

  /// Index where the key can be found as long as the insert did not fail
  KOKKOS_FORCEINLINE_FUNCTION
  uint32_t index() const { return m_index; }

  /// Default state: failed insert (invalid index), no flags, zero probes.
  KOKKOS_FORCEINLINE_FUNCTION
  UnorderedMapInsertResult()
    : m_index(UnorderedMapInvalidIndex)
    , m_status(0)
  {}

  /// Bump the probe count; saturates at LIST_LENGTH_MASK so the counter
  /// never overflows into the flag bits.
  KOKKOS_FORCEINLINE_FUNCTION
  void increment_list_position()
  {
    m_status += (list_position() < LIST_LENGTH_MASK) ? 1u : 0u;
  }

  /// Record that the key already existed at index i, preserving the current
  /// probe count and optionally the FREED_EXISTING flag.
  KOKKOS_FORCEINLINE_FUNCTION
  void set_existing(uint32_t i, bool arg_freed_existing)
  {
    m_index = i;
    m_status = EXISTING | (arg_freed_existing ? FREED_EXISTING : 0u) | list_position();
  }

  /// Record a successful fresh insert at index i, preserving the probe count.
  KOKKOS_FORCEINLINE_FUNCTION
  void set_success(uint32_t i)
  {
    m_index = i;
    m_status = SUCCESS | list_position();
  }

private:
  uint32_t m_index;   // slot index, or UnorderedMapInvalidIndex on failure
  uint32_t m_status;  // flag bits | probe count (see Status)
};

/// \class UnorderedMap
/// \brief Thread-safe, performance-portable lookup table.
+/// +/// This class provides a lookup table. In terms of functionality, +/// this class compares to std::unordered_map (new in C++11). +/// "Unordered" means that keys are not stored in any particular +/// order, unlike (for example) std::map. "Thread-safe" means that +/// lookups, insertion, and deletion are safe to call by multiple +/// threads in parallel. "Performance-portable" means that parallel +/// performance of these operations is reasonable, on multiple +/// hardware platforms. Platforms on which performance has been +/// tested include conventional Intel x86 multicore processors, Intel +/// Xeon Phi ("MIC"), and NVIDIA GPUs. +/// +/// Parallel performance portability entails design decisions that +/// might differ from one's expectation for a sequential interface. +/// This particularly affects insertion of single elements. In an +/// interface intended for sequential use, insertion might reallocate +/// memory if the original allocation did not suffice to hold the new +/// element. In this class, insertion does <i>not</i> reallocate +/// memory. This means that it might fail. insert() returns an enum +/// which indicates whether the insert failed. There are three +/// possible conditions: +/// <ol> +/// <li> <tt>INSERT_FAILED</tt>: The insert failed. This usually +/// means that the UnorderedMap ran out of space. </li> +/// <li> <tt>INSERT_SUCCESS</tt>: The insert succeeded, and the key +/// did <i>not</i> exist in the table before. </li> +/// <li> <tt>INSERT_EXISTING</tt>: The insert succeeded, and the key +/// <i>did</i> exist in the table before. The new value was +/// ignored and the old value was left in place. </li> +/// </ol> +/// +/// \tparam Key Type of keys of the lookup table. If \c const, users +/// are not allowed to add or remove keys, though they are allowed +/// to change values. In that case, the implementation may make +/// optimizations specific to the <tt>Device</tt>. 
For example, if +/// <tt>Device</tt> is \c Cuda, it may use texture fetches to access +/// keys. +/// +/// \tparam Value Type of values stored in the lookup table. You may use +/// \c void here, in which case the table will be a set of keys. If +/// \c const, users are not allowed to change entries. +/// In that case, the implementation may make +/// optimizations specific to the \c Device, such as using texture +/// fetches to access values. +/// +/// \tparam Device The Kokkos Device type. +/// +/// \tparam Hasher Definition of the hash function for instances of +/// <tt>Key</tt>. The default will calculate a bitwise hash. +/// +/// \tparam EqualTo Definition of the equality function for instances of +/// <tt>Key</tt>. The default will do a bitwise equality comparison. +/// +template < typename Key + , typename Value + , typename Device = Kokkos::DefaultExecutionSpace + , typename Hasher = pod_hash<typename Impl::remove_const<Key>::type> + , typename EqualTo = pod_equal_to<typename Impl::remove_const<Key>::type> + > +class UnorderedMap +{ +private: + typedef typename ViewTraits<Key,Device,void,void>::host_mirror_space host_mirror_space ; +public: + //! 
\name Public types and constants + //@{ + + //key_types + typedef Key declared_key_type; + typedef typename Impl::remove_const<declared_key_type>::type key_type; + typedef typename Impl::add_const<key_type>::type const_key_type; + + //value_types + typedef Value declared_value_type; + typedef typename Impl::remove_const<declared_value_type>::type value_type; + typedef typename Impl::add_const<value_type>::type const_value_type; + + typedef Device execution_space; + typedef Hasher hasher_type; + typedef EqualTo equal_to_type; + typedef uint32_t size_type; + + //map_types + typedef UnorderedMap<declared_key_type,declared_value_type,execution_space,hasher_type,equal_to_type> declared_map_type; + typedef UnorderedMap<key_type,value_type,execution_space,hasher_type,equal_to_type> insertable_map_type; + typedef UnorderedMap<const_key_type,value_type,execution_space,hasher_type,equal_to_type> modifiable_map_type; + typedef UnorderedMap<const_key_type,const_value_type,execution_space,hasher_type,equal_to_type> const_map_type; + + static const bool is_set = Impl::is_same<void,value_type>::value; + static const bool has_const_key = Impl::is_same<const_key_type,declared_key_type>::value; + static const bool has_const_value = is_set || Impl::is_same<const_value_type,declared_value_type>::value; + + static const bool is_insertable_map = !has_const_key && (is_set || !has_const_value); + static const bool is_modifiable_map = has_const_key && !has_const_value; + static const bool is_const_map = has_const_key && has_const_value; + + + typedef UnorderedMapInsertResult insert_result; + + typedef UnorderedMap<Key,Value,host_mirror_space,Hasher,EqualTo> HostMirror; + + typedef Impl::UnorderedMapHistogram<const_map_type> histogram_type; + + //@} + +private: + enum { invalid_index = ~static_cast<size_type>(0) }; + + typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type; + + typedef typename Impl::if_c< is_insertable_map + , View< key_type *, 
execution_space> + , View< const key_type *, execution_space, MemoryTraits<RandomAccess> > + >::type key_type_view; + + typedef typename Impl::if_c< is_insertable_map || is_modifiable_map + , View< impl_value_type *, execution_space> + , View< const impl_value_type *, execution_space, MemoryTraits<RandomAccess> > + >::type value_type_view; + + typedef typename Impl::if_c< is_insertable_map + , View< size_type *, execution_space> + , View< const size_type *, execution_space, MemoryTraits<RandomAccess> > + >::type size_type_view; + + typedef typename Impl::if_c< is_insertable_map + , Bitset< execution_space > + , ConstBitset< execution_space> + >::type bitset_type; + + enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 }; + enum { num_scalars = 3 }; + typedef View< int[num_scalars], LayoutLeft, execution_space> scalars_view; + +public: + //! \name Public member functions + //@{ + + UnorderedMap() + : m_bounded_insert() + , m_hasher() + , m_equal_to() + , m_size() + , m_available_indexes() + , m_hash_lists() + , m_next_index() + , m_keys() + , m_values() + , m_scalars() + {} + + /// \brief Constructor + /// + /// \param capacity_hint [in] Initial guess of how many unique keys will be inserted into the map + /// \param hash [in] Hasher function for \c Key instances. The + /// default value usually suffices. + UnorderedMap( size_type capacity_hint, hasher_type hasher = hasher_type(), equal_to_type equal_to = equal_to_type() ) + : m_bounded_insert(true) + , m_hasher(hasher) + , m_equal_to(equal_to) + , m_size() + , m_available_indexes(calculate_capacity(capacity_hint)) + , m_hash_lists(ViewAllocateWithoutInitializing("UnorderedMap hash list"), Impl::find_hash_size(capacity())) + , m_next_index(ViewAllocateWithoutInitializing("UnorderedMap next index"), capacity()+1) // +1 so that the *_at functions can always return a valid reference + , m_keys("UnorderedMap keys",capacity()+1) + , m_values("UnorderedMap values",(is_set? 
1 : capacity()+1)) + , m_scalars("UnorderedMap scalars") + { + if (!is_insertable_map) { + throw std::runtime_error("Cannot construct a non-insertable (i.e. const key_type) unordered_map"); + } + + Kokkos::deep_copy(m_hash_lists, invalid_index); + Kokkos::deep_copy(m_next_index, invalid_index); + } + + void reset_failed_insert_flag() + { + reset_flag(failed_insert_idx); + } + + histogram_type get_histogram() + { + return histogram_type(*this); + } + + //! Clear all entries in the table. + void clear() + { + m_bounded_insert = true; + + if (capacity() == 0) return; + + m_available_indexes.clear(); + + Kokkos::deep_copy(m_hash_lists, invalid_index); + Kokkos::deep_copy(m_next_index, invalid_index); + { + const key_type tmp = key_type(); + Kokkos::deep_copy(m_keys,tmp); + } + if (is_set){ + const impl_value_type tmp = impl_value_type(); + Kokkos::deep_copy(m_values,tmp); + } + { + Kokkos::deep_copy(m_scalars, 0); + } + } + + /// \brief Change the capacity of the the map + /// + /// If there are no failed inserts the current size of the map will + /// be used as a lower bound for the input capacity. + /// If the map is not empty and does not have failed inserts + /// and the capacity changes then the current data is copied + /// into the resized / rehashed map. + /// + /// This is <i>not</i> a device function; it may <i>not</i> be + /// called in a parallel kernel. + bool rehash(size_type requested_capacity = 0) + { + const bool bounded_insert = (capacity() == 0) || (size() == 0u); + return rehash(requested_capacity, bounded_insert ); + } + + bool rehash(size_type requested_capacity, bool bounded_insert) + { + if(!is_insertable_map) return false; + + const size_type curr_size = size(); + requested_capacity = (requested_capacity < curr_size) ? 
curr_size : requested_capacity; + + insertable_map_type tmp(requested_capacity, m_hasher, m_equal_to); + + if (curr_size) { + tmp.m_bounded_insert = false; + Impl::UnorderedMapRehash<insertable_map_type> f(tmp,*this); + f.apply(); + } + tmp.m_bounded_insert = bounded_insert; + + *this = tmp; + + return true; + } + + /// \brief The number of entries in the table. + /// + /// This method has undefined behavior when erasable() is true. + /// + /// Note that this is not a device function; it cannot be called in + /// a parallel kernel. The value is not stored as a variable; it + /// must be computed. + size_type size() const + { + if( capacity() == 0u ) return 0u; + if (modified()) { + m_size = m_available_indexes.count(); + reset_flag(modified_idx); + } + return m_size; + } + + /// \brief The current number of failed insert() calls. + /// + /// This is <i>not</i> a device function; it may <i>not</i> be + /// called in a parallel kernel. The value is not stored as a + /// variable; it must be computed. + bool failed_insert() const + { + return get_flag(failed_insert_idx); + } + + bool erasable() const + { + return is_insertable_map ? get_flag(erasable_idx) : false; + } + + bool begin_erase() + { + bool result = !erasable(); + if (is_insertable_map && result) { + execution_space::fence(); + set_flag(erasable_idx); + execution_space::fence(); + } + return result; + } + + bool end_erase() + { + bool result = erasable(); + if (is_insertable_map && result) { + execution_space::fence(); + Impl::UnorderedMapErase<declared_map_type> f(*this); + f.apply(); + execution_space::fence(); + reset_flag(erasable_idx); + } + return result; + } + + /// \brief The maximum number of entries that the table can hold. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_FORCEINLINE_FUNCTION + size_type capacity() const + { return m_available_indexes.size(); } + + /// \brief The number of hash table "buckets." 
+ /// + /// This is different than the number of entries that the table can + /// hold. Each key hashes to an index in [0, hash_capacity() - 1]. + /// That index can hold zero or more entries. This class decides + /// what hash_capacity() should be, given the user's upper bound on + /// the number of entries the table must be able to hold. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_INLINE_FUNCTION + size_type hash_capacity() const + { return m_hash_lists.dimension_0(); } + + //--------------------------------------------------------------------------- + //--------------------------------------------------------------------------- + + + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. As discussed in the class documentation, it need not + /// succeed. The return value tells you if it did. + /// + /// \param k [in] The key to attempt to insert. + /// \param v [in] The corresponding value to attempt to insert. If + /// using this class as a set (with Value = void), then you need not + /// provide this value. 
+ KOKKOS_INLINE_FUNCTION + insert_result insert(key_type const& k, impl_value_type const&v = impl_value_type()) const + { + insert_result result; + + if ( !is_insertable_map || capacity() == 0u || m_scalars((int)erasable_idx) ) { + return result; + } + + if ( !m_scalars((int)modified_idx) ) { + m_scalars((int)modified_idx) = true; + } + + int volatile & failed_insert_ref = m_scalars((int)failed_insert_idx) ; + + const size_type hash_value = m_hasher(k); + const size_type hash_list = hash_value % m_hash_lists.dimension_0(); + + size_type * curr_ptr = & m_hash_lists[ hash_list ]; + size_type new_index = invalid_index ; + + // Force integer multiply to long + size_type index_hint = static_cast<size_type>( (static_cast<double>(hash_list) * capacity()) / m_hash_lists.dimension_0()); + + size_type find_attempts = 0; + + enum { bounded_find_attempts = 32u }; + const size_type max_attempts = (m_bounded_insert && (bounded_find_attempts < m_available_indexes.max_hint()) ) ? + bounded_find_attempts : + m_available_indexes.max_hint(); + + bool not_done = true ; + +#if defined( __MIC__ ) + #pragma noprefetch +#endif + while ( not_done ) { + + // Continue searching the unordered list for this key, + // list will only be appended during insert phase. + // Need volatile_load as other threads may be appending. + size_type curr = volatile_load(curr_ptr); + + KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]); +#if defined( __MIC__ ) + #pragma noprefetch +#endif + while ( curr != invalid_index && ! m_equal_to( volatile_load(&m_keys[curr]), k) ) { + result.increment_list_position(); + index_hint = curr; + curr_ptr = &m_next_index[curr]; + curr = volatile_load(curr_ptr); + KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]); + } + + //------------------------------------------------------------ + // If key already present then return that index. 
+ if ( curr != invalid_index ) { + + const bool free_existing = new_index != invalid_index; + if ( free_existing ) { + // Previously claimed an unused entry that was not inserted. + // Release this unused entry immediately. + if (!m_available_indexes.reset(new_index) ) { + printf("Unable to free existing\n"); + } + + } + + result.set_existing(curr, free_existing); + not_done = false ; + } + //------------------------------------------------------------ + // Key is not currently in the map. + // If the thread has claimed an entry try to insert now. + else { + + //------------------------------------------------------------ + // If have not already claimed an unused entry then do so now. + if (new_index == invalid_index) { + + bool found = false; + // use the hash_list as the flag for the search direction + Kokkos::tie(found, index_hint) = m_available_indexes.find_any_unset_near( index_hint, hash_list ); + + // found and index and this thread set it + if ( !found && ++find_attempts >= max_attempts ) { + failed_insert_ref = true; + not_done = false ; + } + else if (m_available_indexes.set(index_hint) ) { + new_index = index_hint; + // Set key and value + KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_keys[new_index]); + m_keys[new_index] = k ; + + if (!is_set) { + KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_values[new_index]); + m_values[new_index] = v ; + } + + // Do not proceed until key and value are updated in global memory + memory_fence(); + } + } + else if (failed_insert_ref) { + not_done = false; + } + + // Attempt to append claimed entry into the list. + // Another thread may also be trying to append the same list so protect with atomic. 
+ if ( new_index != invalid_index && + curr == atomic_compare_exchange(curr_ptr, static_cast<size_type>(invalid_index), new_index) ) { + // Succeeded in appending + result.set_success(new_index); + not_done = false ; + } + } + } // while ( not_done ) + + return result ; + } + + KOKKOS_INLINE_FUNCTION + bool erase(key_type const& k) const + { + bool result = false; + + if(is_insertable_map && 0u < capacity() && m_scalars((int)erasable_idx)) { + + if ( ! m_scalars((int)modified_idx) ) { + m_scalars((int)modified_idx) = true; + } + + size_type index = find(k); + if (valid_at(index)) { + m_available_indexes.reset(index); + result = true; + } + } + + return result; + } + + /// \brief Find the given key \c k, if it exists in the table. + /// + /// \return If the key exists in the table, the index of the + /// value corresponding to that key; otherwise, an invalid index. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_INLINE_FUNCTION + size_type find( const key_type & k) const + { + size_type curr = 0u < capacity() ? m_hash_lists( m_hasher(k) % m_hash_lists.dimension_0() ) : invalid_index ; + + KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]); + while (curr != invalid_index && !m_equal_to( m_keys[curr], k) ) { + KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]); + curr = m_next_index[curr]; + } + + return curr; + } + + /// \brief Does the key exist in the map + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_INLINE_FUNCTION + bool exists( const key_type & k) const + { + return valid_at(find(k)); + } + + + /// \brief Get the value with \c i as its direct index. + /// + /// \param i [in] Index directly into the array of entries. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + /// + /// 'const value_type' via Cuda texture fetch must return by value. 
+ KOKKOS_FORCEINLINE_FUNCTION + typename Impl::if_c< (is_set || has_const_value), impl_value_type, impl_value_type &>::type + value_at(size_type i) const + { + return m_values[ is_set ? 0 : (i < capacity() ? i : capacity()) ]; + } + + /// \brief Get the key with \c i as its direct index. + /// + /// \param i [in] Index directly into the array of entries. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_FORCEINLINE_FUNCTION + key_type key_at(size_type i) const + { + return m_keys[ i < capacity() ? i : capacity() ]; + } + + KOKKOS_FORCEINLINE_FUNCTION + bool valid_at(size_type i) const + { + return m_available_indexes.test(i); + } + + template <typename SKey, typename SValue> + UnorderedMap( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src, + typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value,int>::type = 0 + ) + : m_bounded_insert(src.m_bounded_insert) + , m_hasher(src.m_hasher) + , m_equal_to(src.m_equal_to) + , m_size(src.m_size) + , m_available_indexes(src.m_available_indexes) + , m_hash_lists(src.m_hash_lists) + , m_next_index(src.m_next_index) + , m_keys(src.m_keys) + , m_values(src.m_values) + , m_scalars(src.m_scalars) + {} + + + template <typename SKey, typename SValue> + typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value + ,declared_map_type & >::type + operator=( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src) + { + m_bounded_insert = src.m_bounded_insert; + m_hasher = src.m_hasher; + m_equal_to = src.m_equal_to; + m_size = src.m_size; + m_available_indexes = src.m_available_indexes; + m_hash_lists = src.m_hash_lists; + m_next_index = src.m_next_index; + m_keys = src.m_keys; + m_values = src.m_values; + m_scalars = src.m_scalars; + return *this; + } + + template <typename SKey, typename SValue, typename SDevice> + typename Impl::enable_if< Impl::is_same< typename 
Impl::remove_const<SKey>::type, key_type>::value && + Impl::is_same< typename Impl::remove_const<SValue>::type, value_type>::value + >::type + create_copy_view( UnorderedMap<SKey, SValue, SDevice, Hasher,EqualTo> const& src) + { + if (m_hash_lists.ptr_on_device() != src.m_hash_lists.ptr_on_device()) { + + insertable_map_type tmp; + + tmp.m_bounded_insert = src.m_bounded_insert; + tmp.m_hasher = src.m_hasher; + tmp.m_equal_to = src.m_equal_to; + tmp.m_size = src.size(); + tmp.m_available_indexes = bitset_type( src.capacity() ); + tmp.m_hash_lists = size_type_view( ViewAllocateWithoutInitializing("UnorderedMap hash list"), src.m_hash_lists.dimension_0() ); + tmp.m_next_index = size_type_view( ViewAllocateWithoutInitializing("UnorderedMap next index"), src.m_next_index.dimension_0() ); + tmp.m_keys = key_type_view( ViewAllocateWithoutInitializing("UnorderedMap keys"), src.m_keys.dimension_0() ); + tmp.m_values = value_type_view( ViewAllocateWithoutInitializing("UnorderedMap values"), src.m_values.dimension_0() ); + tmp.m_scalars = scalars_view("UnorderedMap scalars"); + + Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes); + + typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, typename SDevice::memory_space > raw_deep_copy; + + raw_deep_copy(tmp.m_hash_lists.ptr_on_device(), src.m_hash_lists.ptr_on_device(), sizeof(size_type)*src.m_hash_lists.dimension_0()); + raw_deep_copy(tmp.m_next_index.ptr_on_device(), src.m_next_index.ptr_on_device(), sizeof(size_type)*src.m_next_index.dimension_0()); + raw_deep_copy(tmp.m_keys.ptr_on_device(), src.m_keys.ptr_on_device(), sizeof(key_type)*src.m_keys.dimension_0()); + if (!is_set) { + raw_deep_copy(tmp.m_values.ptr_on_device(), src.m_values.ptr_on_device(), sizeof(impl_value_type)*src.m_values.dimension_0()); + } + raw_deep_copy(tmp.m_scalars.ptr_on_device(), src.m_scalars.ptr_on_device(), sizeof(int)*num_scalars ); + + *this = tmp; + } + } + + //@} +private: // private member functions + + 
bool modified() const + { + return get_flag(modified_idx); + } + + void set_flag(int flag) const + { + typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy; + const int true_ = true; + raw_deep_copy(m_scalars.ptr_on_device() + flag, &true_, sizeof(int)); + } + + void reset_flag(int flag) const + { + typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy; + const int false_ = false; + raw_deep_copy(m_scalars.ptr_on_device() + flag, &false_, sizeof(int)); + } + + bool get_flag(int flag) const + { + typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename execution_space::memory_space > raw_deep_copy; + int result = false; + raw_deep_copy(&result, m_scalars.ptr_on_device() + flag, sizeof(int)); + return result; + } + + static uint32_t calculate_capacity(uint32_t capacity_hint) + { + // increase by 16% and round to nears multiple of 128 + return capacity_hint ? ((static_cast<uint32_t>(7ull*capacity_hint/6u) + 127u)/128u)*128u : 128u; + } + +private: // private members + bool m_bounded_insert; + hasher_type m_hasher; + equal_to_type m_equal_to; + mutable size_type m_size; + bitset_type m_available_indexes; + size_type_view m_hash_lists; + size_type_view m_next_index; + key_type_view m_keys; + value_type_view m_values; + scalars_view m_scalars; + + template <typename KKey, typename VValue, typename DDevice, typename HHash, typename EEqualTo> + friend class UnorderedMap; + + template <typename UMap> + friend struct Impl::UnorderedMapErase; + + template <typename UMap> + friend struct Impl::UnorderedMapHistogram; + + template <typename UMap> + friend struct Impl::UnorderedMapPrint; +}; + +// Specialization of deep_copy for two UnorderedMap objects. 
+template < typename DKey, typename DT, typename DDevice + , typename SKey, typename ST, typename SDevice + , typename Hasher, typename EqualTo > +inline void deep_copy( UnorderedMap<DKey, DT, DDevice, Hasher, EqualTo> & dst + , const UnorderedMap<SKey, ST, SDevice, Hasher, EqualTo> & src ) +{ + dst.create_copy_view(src); +} + + +} // namespace Kokkos + +#endif //KOKKOS_UNORDERED_MAP_HPP diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp new file mode 100755 index 0000000000000000000000000000000000000000..db54b0c350ff18cc524066d52325fbca8d8701be --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp @@ -0,0 +1,287 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VECTOR_HPP +#define KOKKOS_VECTOR_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_DualView.hpp> + +/* Drop in replacement for std::vector based on Kokkos::DualView + * Most functions only work on the host (it will not compile if called from device kernel) + * + */ + namespace Kokkos { + +template <typename Scalar, class Space = Kokkos::DefaultExecutionSpace > +class vector : public DualView<Scalar*,LayoutLeft,Space> { +public: + typedef typename Space::memory_space memory_space; + typedef typename Space::execution_space execution_space; + typedef typename Kokkos::Device<execution_space,memory_space> device_type; + + typedef Scalar value_type; + typedef Scalar* pointer; + typedef const Scalar* const_pointer; + typedef Scalar* reference; + typedef const Scalar* const_reference; + typedef Scalar* iterator; + typedef const Scalar* const_iterator; + +private: + size_t _size; + typedef size_t size_type; + float _extra_storage; + typedef DualView<Scalar*,LayoutLeft,Space> DV; + + +public: +#ifdef KOKKOS_CUDA_USE_UVM + KOKKOS_INLINE_FUNCTION Scalar& operator() (int i) const {return DV::h_view(i);}; + KOKKOS_INLINE_FUNCTION Scalar& operator[] (int i) const {return DV::h_view(i);}; +#else + inline Scalar& operator() (int i) const {return DV::h_view(i);}; + 
inline Scalar& operator[] (int i) const {return DV::h_view(i);}; +#endif + + /* Member functions which behave like std::vector functions */ + + vector():DV() { + _size = 0; + _extra_storage = 1.1; + DV::modified_host() = 1; + }; + + + vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Space>("Vector",size_t(n*(1.1))) { + _size = n; + _extra_storage = 1.1; + DV::modified_host() = 1; + + assign(n,val); + } + + + void resize(size_t n) { + if(n>=capacity()) + DV::resize(size_t (n*_extra_storage)); + _size = n; + } + + void resize(size_t n, const Scalar& val) { + assign(n,val); + } + + void assign (size_t n, const Scalar& val) { + + /* Resize if necessary (behavour of std:vector) */ + + if(n>capacity()) + DV::resize(size_t (n*_extra_storage)); + _size = n; + + /* Assign value either on host or on device */ + + if( DV::modified_host() >= DV::modified_device() ) { + set_functor_host f(DV::h_view,val); + parallel_for(n,f); + DV::t_host::execution_space::fence(); + DV::modified_host()++; + } else { + set_functor f(DV::d_view,val); + parallel_for(n,f); + DV::t_dev::execution_space::fence(); + DV::modified_device()++; + } + } + + void reserve(size_t n) { + DV::resize(size_t (n*_extra_storage)); + } + + void push_back(Scalar val) { + DV::modified_host()++; + if(_size == capacity()) { + size_t new_size = _size*_extra_storage; + if(new_size == _size) new_size++; + DV::resize(new_size); + } + + DV::h_view(_size) = val; + _size++; + + }; + + void pop_back() { + _size--; + }; + + void clear() { + _size = 0; + } + + size_type size() const {return _size;}; + size_type max_size() const {return 2000000000;} + size_type capacity() const {return DV::capacity();}; + bool empty() const {return _size==0;}; + + iterator begin() const {return &DV::h_view(0);}; + + iterator end() const {return &DV::h_view(_size);}; + + + /* std::algorithms wich work originally with iterators, here they are implemented as member functions */ + + size_t + lower_bound (const size_t& start, + const 
size_t& theEnd, + const Scalar& comp_val) const + { + int lower = start; // FIXME (mfh 24 Apr 2014) narrowing conversion + int upper = _size > theEnd? theEnd : _size-1; // FIXME (mfh 24 Apr 2014) narrowing conversion + if (upper <= lower) { + return theEnd; + } + + Scalar lower_val = DV::h_view(lower); + Scalar upper_val = DV::h_view(upper); + size_t idx = (upper+lower)/2; + Scalar val = DV::h_view(idx); + if(val>upper_val) return upper; + if(val<lower_val) return start; + + while(upper>lower) { + if(comp_val>val) { + lower = ++idx; + } else { + upper = idx; + } + idx = (upper+lower)/2; + val = DV::h_view(idx); + } + return idx; + } + + bool is_sorted() { + for(int i=0;i<_size-1;i++) { + if(DV::h_view(i)>DV::h_view(i+1)) return false; + } + return true; + } + + iterator find(Scalar val) const { + if(_size == 0) return end(); + + int upper,lower,current; + current = _size/2; + upper = _size-1; + lower = 0; + + if((val<DV::h_view(0)) || (val>DV::h_view(_size-1)) ) return end(); + + while(upper>lower) + { + if(val>DV::h_view(current)) lower = current+1; + else upper = current; + current = (upper+lower)/2; + } + + if(val==DV::h_view(current)) return &DV::h_view(current); + else return end(); + } + + /* Additional functions for data management */ + + void device_to_host(){ + deep_copy(DV::h_view,DV::d_view); + } + void host_to_device() const { + deep_copy(DV::d_view,DV::h_view); + } + + void on_host() { + DV::modified_host() = DV::modified_device() + 1; + } + void on_device() { + DV::modified_device() = DV::modified_host() + 1; + } + + void set_overallocation(float extra) { + _extra_storage = 1.0 + extra; + } + + +public: + struct set_functor { + typedef typename DV::t_dev::execution_space execution_space; + typename DV::t_dev _data; + Scalar _val; + + set_functor(typename DV::t_dev data, Scalar val) : + _data(data),_val(val) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int &i) const { + _data(i) = _val; + } + }; + + struct set_functor_host { + typedef 
typename DV::t_host::execution_space execution_space; + typename DV::t_host _data; + Scalar _val; + + set_functor_host(typename DV::t_host data, Scalar val) : + _data(data),_val(val) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int &i) const { + _data(i) = _val; + } + }; + +}; + + +} +#endif diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp new file mode 100755 index 0000000000000000000000000000000000000000..7de290e71138d5660563d5ab27fc0c86ef27762e --- /dev/null +++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp @@ -0,0 +1,173 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_BITSET_IMPL_HPP +#define KOKKOS_BITSET_IMPL_HPP + +#include <Kokkos_Macros.hpp> +#include <stdint.h> + +#include <cstdio> +#include <climits> +#include <iostream> +#include <iomanip> + +namespace Kokkos { namespace Impl { + +KOKKOS_FORCEINLINE_FUNCTION +unsigned rotate_right(unsigned i, int r) +{ + enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) }; + return r ? 
((i >> r) | (i << (size-r))) : i ; +} + +KOKKOS_FORCEINLINE_FUNCTION +int bit_scan_forward(unsigned i) +{ +#if defined( __CUDA_ARCH__ ) + return __ffs(i) - 1; +#elif defined( __GNUC__ ) || defined( __GNUG__ ) + return __builtin_ffs(i) - 1; +#elif defined( __INTEL_COMPILER ) + return _bit_scan_forward(i); +#else + + unsigned t = 1u; + int r = 0; + while (i && (i & t == 0)) + { + t = t << 1; + ++r; + } + return r; +#endif +} + + +KOKKOS_FORCEINLINE_FUNCTION +int bit_scan_reverse(unsigned i) +{ + enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) }; +#if defined( __CUDA_ARCH__ ) + return shift - __clz(i); +#elif defined( __GNUC__ ) || defined( __GNUG__ ) + return shift - __builtin_clz(i); +#elif defined( __INTEL_COMPILER ) + return _bit_scan_reverse(i); +#else + unsigned t = 1u << shift; + int r = 0; + while (i && (i & t == 0)) + { + t = t >> 1; + ++r; + } + return r; +#endif +} + + +// count the bits set +KOKKOS_FORCEINLINE_FUNCTION +int popcount(unsigned i) +{ +#if defined( __CUDA_ARCH__ ) + return __popc(i); +#elif defined( __GNUC__ ) || defined( __GNUG__ ) + return __builtin_popcount(i); +#elif defined ( __INTEL_COMPILER ) + return _popcnt32(i); +#else + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive + i = i - ((i >> 1) & ~0u/3u); // temp + i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u); // temp + i = (i + (i >> 4)) & ~0u/255u*15u; // temp + return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count +#endif +} + + +template <typename Bitset> +struct BitsetCount +{ + typedef Bitset bitset_type; + typedef typename bitset_type::execution_space::execution_space execution_space; + typedef typename bitset_type::size_type size_type; + typedef size_type value_type; + + bitset_type m_bitset; + + BitsetCount( bitset_type const& bitset) + : m_bitset(bitset) + {} + + size_type apply() const + { + size_type count = 0u; + parallel_reduce(m_bitset.m_blocks.dimension_0(), *this, count); + return count; + } + + 
KOKKOS_INLINE_FUNCTION + static void init( value_type & count) + { + count = 0u; + } + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & count, const volatile size_type & incr ) + { + count += incr; + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type i, value_type & count) const + { + count += popcount(m_bitset.m_blocks[i]); + } +}; + +}} //Kokkos::Impl + +#endif // KOKKOS_BITSET_IMPL_HPP + diff --git a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp new file mode 100755 index 0000000000000000000000000000000000000000..c87bb8a3a37cb6820d31bdd691cf447b20bbd185 --- /dev/null +++ b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp @@ -0,0 +1,195 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_FUNCTIONAL_IMPL_HPP +#define KOKKOS_FUNCTIONAL_IMPL_HPP + +#include <Kokkos_Macros.hpp> +#include <stdint.h> + +namespace Kokkos { namespace Impl { + +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
+KOKKOS_FORCEINLINE_FUNCTION +uint32_t getblock32 ( const uint8_t * p, int i ) +{ +// used to avoid aliasing error which could cause errors with +// forced inlining + return ((uint32_t)p[i*4+0]) + | ((uint32_t)p[i*4+1] << 8) + | ((uint32_t)p[i*4+2] << 16) + | ((uint32_t)p[i*4+3] << 24); +} + +KOKKOS_FORCEINLINE_FUNCTION +uint32_t rotl32 ( uint32_t x, int8_t r ) +{ return (x << r) | (x >> (32 - r)); } + +KOKKOS_FORCEINLINE_FUNCTION +uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +KOKKOS_INLINE_FUNCTION +uint32_t MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + for(int i=0; i<nblocks; ++i) + { + uint32_t k1 = getblock32(data,i); + + k1 *= c1; + k1 = rotl32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = rotl32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + return h1; +} + + +#if defined( __GNUC__ ) /* GNU C */ || \ + defined( __GNUG__ ) /* GNU C++ */ || \ + defined( __clang__ ) + +#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__)) + +#else + +#define KOKKOS_MAY_ALIAS + +#endif + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +bool bitwise_equal(T const * const a_ptr, T const * const b_ptr) +{ + typedef uint64_t KOKKOS_MAY_ALIAS T64; + typedef uint32_t KOKKOS_MAY_ALIAS T32; + typedef uint16_t KOKKOS_MAY_ALIAS T16; + typedef uint8_t KOKKOS_MAY_ALIAS T8; + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 
4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const * const ptr; + T64 const * const ptr64; + T32 const * const ptr32; + T16 const * const ptr16; + T8 const * const ptr8; + } a = {a_ptr}, b = {b_ptr}; + + bool result = true; + + for (int i=0; i < NUM_64; ++i) { + result = result && a.ptr64[i] == b.ptr64[i]; + } + + if ( NUM_64*2 < NUM_32 ) { + result = result && a.ptr32[NUM_64*2] == b.ptr32[NUM_64*2]; + } + + if ( NUM_32*2 < NUM_16 ) { + result = result && a.ptr16[NUM_32*2] == b.ptr16[NUM_32*2]; + } + + if ( NUM_16*2 < NUM_8 ) { + result = result && a.ptr8[NUM_16*2] == b.ptr8[NUM_16*2]; + } + + return result; +} + + + +#undef KOKKOS_MAY_ALIAS + +}} // namespace Kokkos::Impl + +#endif //KOKKOS_FUNCTIONAL_IMPL_HPP diff --git a/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp new file mode 100755 index 0000000000000000000000000000000000000000..c52fc24359b8f7bd34489d94914ea304f7bc3425 --- /dev/null +++ b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp @@ -0,0 +1,208 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP +#define KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > +inline +typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view , + typename Impl::enable_if< ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 ) +{ + return view ; +} + +template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > +inline +typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view ) +{ + // Force copy: + //typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused + typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type ; + + typename staticcrsgraph_type::HostMirror tmp ; + typename staticcrsgraph_type::row_map_type::HostMirror tmp_row_map = create_mirror( view.row_map); + + // Allocation to match: + tmp.row_map = tmp_row_map ; // Assignment of 'const' from 'non-const' + tmp.entries = create_mirror( view.entries ); + + + // Deep copy: + deep_copy( tmp_row_map , view.row_map ); + deep_copy( tmp.entries , view.entries ); + + return tmp ; +} + +template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > +inline +typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view , + typename Impl::enable_if< ! 
ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 ) +{ + return create_mirror( view ); +} +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class StaticCrsGraphType , class InputSizeType > +inline +typename StaticCrsGraphType::staticcrsgraph_type +create_staticcrsgraph( const std::string & label , + const std::vector< InputSizeType > & input ) +{ + typedef StaticCrsGraphType output_type ; + //typedef std::vector< InputSizeType > input_type ; // unused + + typedef typename output_type::entries_type entries_type ; + + typedef View< typename output_type::size_type [] , + typename output_type::array_layout , + typename output_type::execution_space > work_type ; + + output_type output ; + + // Create the row map: + + const size_t length = input.size(); + + { + work_type row_work( "tmp" , length + 1 ); + + typename work_type::HostMirror row_work_host = + create_mirror_view( row_work ); + + size_t sum = 0 ; + row_work_host[0] = 0 ; + for ( size_t i = 0 ; i < length ; ++i ) { + row_work_host[i+1] = sum += input[i]; + } + + deep_copy( row_work , row_work_host ); + + output.entries = entries_type( label , sum ); + output.row_map = row_work ; + } + + return output ; +} + +//---------------------------------------------------------------------------- + +template< class StaticCrsGraphType , class InputSizeType > +inline +typename StaticCrsGraphType::staticcrsgraph_type +create_staticcrsgraph( const std::string & label , + const std::vector< std::vector< InputSizeType > > & input ) +{ + typedef StaticCrsGraphType output_type ; + typedef typename output_type::entries_type entries_type ; + + static_assert( entries_type::rank == 1 + , "Graph entries view must be rank one" ); + + typedef View< typename output_type::size_type [] , + typename output_type::array_layout , + typename 
output_type::execution_space > work_type ; + + output_type output ; + + // Create the row map: + + const size_t length = input.size(); + + { + work_type row_work( "tmp" , length + 1 ); + + typename work_type::HostMirror row_work_host = + create_mirror_view( row_work ); + + size_t sum = 0 ; + row_work_host[0] = 0 ; + for ( size_t i = 0 ; i < length ; ++i ) { + row_work_host[i+1] = sum += input[i].size(); + } + + deep_copy( row_work , row_work_host ); + + output.entries = entries_type( label , sum ); + output.row_map = row_work ; + } + + // Fill in the entries: + { + typename entries_type::HostMirror host_entries = + create_mirror_view( output.entries ); + + size_t sum = 0 ; + for ( size_t i = 0 ; i < length ; ++i ) { + for ( size_t j = 0 ; j < input[i].size() ; ++j , ++sum ) { + host_entries( sum ) = input[i][j] ; + } + } + + deep_copy( output.entries , host_entries ); + } + + return output ; +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP */ + diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp new file mode 100755 index 0000000000000000000000000000000000000000..843fd3a8089999ab80b23506c2206e7a5de325e9 --- /dev/null +++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp @@ -0,0 +1,101 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_UnorderedMap.hpp> + +namespace Kokkos { namespace Impl { + +uint32_t find_hash_size(uint32_t size) +{ + if (size == 0u) return 0u; + + // these primes try to preserve randomness of hash + static const uint32_t primes [] = { + 3, 7, 13, 23, 53, 97, 193, 389, 769, 1543 + , 2237, 2423, 2617, 2797, 2999, 3167, 3359, 3539 + , 3727, 3911, 4441 , 4787 , 5119 , 5471 , 5801 , 6143 , 6521 , 6827 + , 7177 , 7517 , 7853 , 8887 , 9587 , 10243 , 10937 , 11617 , 12289 + , 12967 , 13649 , 14341 , 15013 , 15727 + , 17749 , 19121 , 20479 , 21859 , 23209 , 24593 , 25939 , 27329 + , 28669 , 30047 , 31469 , 35507 , 38231 , 40961 , 43711 , 46439 + , 49157 , 51893 , 54617 , 57347 , 60077 , 62801 , 70583 , 75619 + , 80669 , 85703 , 90749 , 95783 , 100823 , 105871 , 110909 , 115963 + , 120997 , 126031 , 141157 , 151237 , 161323 , 171401 , 181499 , 191579 + , 201653 , 211741 , 221813 , 231893 , 241979 , 252079 + , 282311 , 302483 , 322649 , 342803 , 362969 , 383143 , 403301 , 423457 + , 443629 , 463787 , 483953 , 504121 , 564617 , 604949 , 645313 , 685609 + , 725939 , 766273 , 806609 , 846931 , 887261 , 927587 , 967919 , 1008239 + , 1123477 , 1198397 , 1273289 , 1348177 , 1423067 , 1497983 , 1572869 + , 1647761 , 1722667 , 1797581 , 1872461 , 1947359 , 2022253 + , 2246953 , 2396759 , 2546543 , 2696363 , 2846161 , 2995973 , 3145739 + , 3295541 , 3445357 , 3595117 , 3744941 , 3894707 , 4044503 + , 4493921 , 4793501 , 5093089 , 5392679 , 5692279 , 5991883 , 6291469 + , 6591059 , 6890641 , 7190243 , 7489829 , 7789447 , 8089033 + , 8987807 , 9586981 , 10186177 , 10785371 , 11384539 , 11983729 + , 12582917 , 13182109 , 13781291 , 14380469 , 14979667 , 15578861 + , 16178053 , 17895707 , 19014187 , 20132683 , 21251141 , 22369661 + , 23488103 , 24606583 , 25725083 , 26843549 , 27962027 , 29080529 + , 30198989 , 31317469 , 32435981 , 35791397 , 
38028379 , 40265327 + , 42502283 , 44739259 , 46976221 , 49213237 , 51450131 , 53687099 + , 55924061 , 58161041 , 60397993 , 62634959 , 64871921 + , 71582857 , 76056727 , 80530643 , 85004567 , 89478503 , 93952427 + , 98426347 , 102900263 , 107374217 , 111848111 , 116322053 , 120795971 + , 125269877 , 129743807 , 143165587 , 152113427 , 161061283 , 170009141 + , 178956983 , 187904819 , 196852693 , 205800547 , 214748383 , 223696237 + , 232644089 , 241591943 , 250539763 , 259487603 , 268435399 + }; + + const uint32_t num_primes = sizeof(primes)/sizeof(uint32_t); + + uint32_t hsize = primes[num_primes-1] ; + for (uint32_t i = 0; i < num_primes; ++i) { + if (size <= primes[i]) { + hsize = primes[i]; + break; + } + } + return hsize; +} + +}} // namespace Kokkos::Impl + diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp new file mode 100755 index 0000000000000000000000000000000000000000..b788c966e9c5a04d0ce4ca626190d241ec273008 --- /dev/null +++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -0,0 +1,297 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_UNORDERED_MAP_IMPL_HPP +#define KOKKOS_UNORDERED_MAP_IMPL_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <stdint.h> + +#include <cstdio> +#include <climits> +#include <iostream> +#include <iomanip> + +namespace Kokkos { namespace Impl { + +uint32_t find_hash_size( uint32_t size ); + +template <typename Map> +struct UnorderedMapRehash +{ + typedef Map map_type; + typedef typename map_type::const_map_type const_map_type; + typedef typename map_type::execution_space execution_space; + typedef typename map_type::size_type size_type; + + map_type m_dst; + const_map_type m_src; + + UnorderedMapRehash( map_type const& dst, const_map_type const& src) + : m_dst(dst), m_src(src) + {} + + void apply() const + { + parallel_for(m_src.capacity(), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const + { + if ( m_src.valid_at(i) ) + m_dst.insert(m_src.key_at(i), m_src.value_at(i)); + } + +}; + +template <typename UMap> +struct UnorderedMapErase +{ + typedef UMap map_type; + typedef typename map_type::execution_space execution_space; + typedef typename map_type::size_type size_type; + typedef typename map_type::key_type key_type; + typedef typename map_type::impl_value_type value_type; + + map_type m_map; + + UnorderedMapErase( map_type const& map) + : m_map(map) + {} + + void apply() const + { + parallel_for(m_map.m_hash_lists.dimension_0(), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type i ) const + { + const size_type invalid_index = map_type::invalid_index; + + size_type curr = m_map.m_hash_lists(i); + size_type next = invalid_index; + + // remove erased head of the linked-list + while (curr != invalid_index && !m_map.valid_at(curr)) { + next = m_map.m_next_index[curr]; + m_map.m_next_index[curr] = invalid_index; + m_map.m_keys[curr] = key_type(); + if (m_map.is_set) m_map.m_values[curr] 
= value_type(); + curr = next; + m_map.m_hash_lists(i) = next; + } + + // if the list is non-empty and the head is valid + if (curr != invalid_index && m_map.valid_at(curr) ) { + size_type prev = curr; + curr = m_map.m_next_index[prev]; + + while (curr != invalid_index) { + next = m_map.m_next_index[curr]; + if (m_map.valid_at(curr)) { + prev = curr; + } + else { + // remove curr from list + m_map.m_next_index[prev] = next; + m_map.m_next_index[curr] = invalid_index; + m_map.m_keys[curr] = key_type(); + if (map_type::is_set) m_map.m_values[curr] = value_type(); + } + curr = next; + } + } + } +}; + +template <typename UMap> +struct UnorderedMapHistogram +{ + typedef UMap map_type; + typedef typename map_type::execution_space execution_space; + typedef typename map_type::size_type size_type; + + typedef View<int[100], execution_space> histogram_view; + typedef typename histogram_view::HostMirror host_histogram_view; + + map_type m_map; + histogram_view m_length; + histogram_view m_distance; + histogram_view m_block_distance; + + UnorderedMapHistogram( map_type const& map) + : m_map(map) + , m_length("UnorderedMap Histogram") + , m_distance("UnorderedMap Histogram") + , m_block_distance("UnorderedMap Histogram") + {} + + void calculate() + { + parallel_for(m_map.m_hash_lists.dimension_0(), *this); + } + + void clear() + { + Kokkos::deep_copy(m_length, 0); + Kokkos::deep_copy(m_distance, 0); + Kokkos::deep_copy(m_block_distance, 0); + } + + void print_length(std::ostream &out) + { + host_histogram_view host_copy = create_mirror_view(m_length); + Kokkos::deep_copy(host_copy, m_length); + + for (int i=0, size = host_copy.dimension_0(); i<size; ++i) + { + out << host_copy[i] << " , "; + } + out << "\b\b\b " << std::endl; + } + + void print_distance(std::ostream &out) + { + host_histogram_view host_copy = create_mirror_view(m_distance); + Kokkos::deep_copy(host_copy, m_distance); + + for (int i=0, size = host_copy.dimension_0(); i<size; ++i) + { + out << host_copy[i] << " 
, "; + } + out << "\b\b\b " << std::endl; + } + + void print_block_distance(std::ostream &out) + { + host_histogram_view host_copy = create_mirror_view(m_block_distance); + Kokkos::deep_copy(host_copy, m_block_distance); + + for (int i=0, size = host_copy.dimension_0(); i<size; ++i) + { + out << host_copy[i] << " , "; + } + out << "\b\b\b " << std::endl; + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type i ) const + { + const size_type invalid_index = map_type::invalid_index; + + uint32_t length = 0; + size_type min_index = ~0u, max_index = 0; + for (size_type curr = m_map.m_hash_lists(i); curr != invalid_index; curr = m_map.m_next_index[curr]) { + ++length; + min_index = (curr < min_index) ? curr : min_index; + max_index = (max_index < curr) ? curr : max_index; + } + + size_type distance = (0u < length) ? max_index - min_index : 0u; + size_type blocks = (0u < length) ? max_index/32u - min_index/32u : 0u; + + // normalize data + length = length < 100u ? length : 99u; + distance = distance < 100u ? distance : 99u; + blocks = blocks < 100u ? 
blocks : 99u; + + if (0u < length) + { + atomic_fetch_add( &m_length(length), 1); + atomic_fetch_add( &m_distance(distance), 1); + atomic_fetch_add( &m_block_distance(blocks), 1); + } + } +}; + +template <typename UMap> +struct UnorderedMapPrint +{ + typedef UMap map_type; + typedef typename map_type::execution_space execution_space; + typedef typename map_type::size_type size_type; + + map_type m_map; + + UnorderedMapPrint( map_type const& map) + : m_map(map) + {} + + void apply() + { + parallel_for(m_map.m_hash_lists.dimension_0(), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type i ) const + { + const size_type invalid_index = map_type::invalid_index; + + uint32_t list = m_map.m_hash_lists(i); + for (size_type curr = list, ii=0; curr != invalid_index; curr = m_map.m_next_index[curr], ++ii) { + printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr), m_map.value_at(curr)); + } + } +}; + +template <typename DKey, typename DValue, typename SKey, typename SValue> +struct UnorderedMapCanAssign : public false_ {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<Key,Value,Key,Value> : public true_ {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<const Key,Value,Key,Value> : public true_ {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<const Key,const Value,Key,Value> : public true_ {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<const Key,const Value,const Key,Value> : public true_ {}; + + +}} //Kokkos::Impl + +#endif // KOKKOS_UNORDERED_MAP_IMPL_HPP diff --git a/lib/kokkos/containers/unit_tests/Makefile b/lib/kokkos/containers/unit_tests/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..176bfa906e54fe4a6212702944bc43bff36c7957 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/Makefile @@ -0,0 +1,92 @@ +KOKKOS_PATH = ../.. 
+ +GTEST_PATH = ../../TPL/gtest + +vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests + +default: build_all + echo "End Build" + + +include $(KOKKOS_PATH)/Makefile.kokkos + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + CXX = nvcc_wrapper + CXXFLAGS ?= -O3 + LINK = $(CXX) + LDFLAGS ?= -lpthread +else + CXX ?= g++ + CXXFLAGS ?= -O3 + LINK ?= $(CXX) + LDFLAGS ?= -lpthread +endif + +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests + +TEST_TARGETS = +TARGETS = + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o + TARGETS += KokkosContainers_UnitTest_Cuda + TEST_TARGETS += test-cuda +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o + TARGETS += KokkosContainers_UnitTest_Threads + TEST_TARGETS += test-threads +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o + TARGETS += KokkosContainers_UnitTest_OpenMP + TEST_TARGETS += test-openmp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o + TARGETS += KokkosContainers_UnitTest_Serial + TEST_TARGETS += test-serial +endif + +KokkosContainers_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Cuda + +KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Threads + +KokkosContainers_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_OpenMP + +KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Serial + 
+test-cuda: KokkosContainers_UnitTest_Cuda + ./KokkosContainers_UnitTest_Cuda + +test-threads: KokkosContainers_UnitTest_Threads + ./KokkosContainers_UnitTest_Threads + +test-openmp: KokkosContainers_UnitTest_OpenMP + ./KokkosContainers_UnitTest_OpenMP + +test-serial: KokkosContainers_UnitTest_Serial + ./KokkosContainers_UnitTest_Serial + +build_all: $(TARGETS) + +test: $(TEST_TARGETS) + +clean: kokkos-clean + rm -f *.o $(TARGETS) + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< + +gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc + diff --git a/lib/kokkos/containers/unit_tests/TestBitset.hpp b/lib/kokkos/containers/unit_tests/TestBitset.hpp new file mode 100755 index 0000000000000000000000000000000000000000..76fb30edcb68aa37f7beb55352212211bcf586c3 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestBitset.hpp @@ -0,0 +1,285 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_BITSET_HPP +#define KOKKOS_TEST_BITSET_HPP + +#include <gtest/gtest.h> +#include <iostream> + + +namespace Test { + +namespace Impl { + +template <typename Bitset, bool Set> +struct TestBitset +{ + typedef Bitset bitset_type; + typedef typename bitset_type::execution_space execution_space; + typedef uint32_t value_type; + + bitset_type m_bitset; + + TestBitset( bitset_type const& bitset) + : m_bitset(bitset) + {} + + unsigned testit(unsigned collisions) + { + execution_space::fence(); + + unsigned count = 0; + Kokkos::parallel_reduce( m_bitset.size()*collisions, *this, count); + return count; + } + + + KOKKOS_INLINE_FUNCTION + void init( value_type & v ) const { v = 0; } + + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & dst, const volatile value_type & src ) const + { dst += src; } + + KOKKOS_INLINE_FUNCTION + void operator()(uint32_t i, value_type & v) const + { + i = i % m_bitset.size(); + if (Set) { + if (m_bitset.set(i)) { + if (m_bitset.test(i)) ++v; + } + } + else { + if (m_bitset.reset(i)) { + if (!m_bitset.test(i)) ++v; + } + } + } + +}; + +template <typename Bitset> +struct TestBitsetTest +{ + typedef Bitset bitset_type; + typedef typename bitset_type::execution_space execution_space; + typedef uint32_t value_type; + + bitset_type m_bitset; + + TestBitsetTest( bitset_type const& bitset) + : m_bitset(bitset) + {} + + unsigned testit() + { + execution_space::fence(); + + unsigned count = 0; + Kokkos::parallel_reduce( m_bitset.size(), *this, count); + return count; + } + + + KOKKOS_INLINE_FUNCTION + void init( value_type & v ) const { v = 0; } + + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & dst, const volatile value_type & src ) const + { dst += src; } + + KOKKOS_INLINE_FUNCTION + void operator()(uint32_t i, value_type & v) const + { + if (m_bitset.test( i )) ++v; + } +}; + +template 
<typename Bitset, bool Set> +struct TestBitsetAny +{ + typedef Bitset bitset_type; + typedef typename bitset_type::execution_space execution_space; + typedef uint32_t value_type; + + bitset_type m_bitset; + + TestBitsetAny( bitset_type const& bitset) + : m_bitset(bitset) + {} + + unsigned testit() + { + execution_space::fence(); + + unsigned count = 0; + Kokkos::parallel_reduce( m_bitset.size(), *this, count); + return count; + } + + + KOKKOS_INLINE_FUNCTION + void init( value_type & v ) const { v = 0; } + + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & dst, const volatile value_type & src ) const + { dst += src; } + + KOKKOS_INLINE_FUNCTION + void operator()(uint32_t i, value_type & v) const + { + bool result = false; + unsigned attempts = 0; + uint32_t hint = (i >> 4) << 4; + while (attempts < m_bitset.max_hint()) { + if (Set) { + Kokkos::tie(result, hint) = m_bitset.find_any_unset_near(hint, i); + if (result && m_bitset.set(hint)) { + ++v; + break; + } + else if (!result) { + ++attempts; + } + } + else { + Kokkos::tie(result, hint) = m_bitset.find_any_set_near(hint, i); + if (result && m_bitset.reset(hint)) { + ++v; + break; + } + else if (!result) { + ++attempts; + } + } + } + } + +}; +} // namespace Impl + + + +template <typename Device> +void test_bitset() +{ + typedef Kokkos::Bitset< Device > bitset_type; + typedef Kokkos::ConstBitset< Device > const_bitset_type; + + //unsigned test_sizes[] = { 0u, 1000u, 1u<<14, 1u<<16, 10000001 }; + unsigned test_sizes[] = { 1000u, 1u<<14, 1u<<16, 10000001 }; + + for (int i=0, end = sizeof(test_sizes)/sizeof(unsigned); i<end; ++i) { + + //std::cout << "Bitset " << test_sizes[i] << std::endl; + + bitset_type bitset(test_sizes[i]); + + //std::cout << " Check inital count " << std::endl; + // nothing should be set + { + Impl::TestBitsetTest< bitset_type > f(bitset); + uint32_t count = f.testit(); + EXPECT_EQ(0u, count); + EXPECT_EQ(count, bitset.count()); + } + + //std::cout << " Check set() " << std::endl; + 
bitset.set(); + // everything should be set + { + Impl::TestBitsetTest< const_bitset_type > f(bitset); + uint32_t count = f.testit(); + EXPECT_EQ(bitset.size(), count); + EXPECT_EQ(count, bitset.count()); + } + + //std::cout << " Check reset() " << std::endl; + bitset.reset(); + EXPECT_EQ(0u, bitset.count()); + + //std::cout << " Check set(i) " << std::endl; + // test setting bits + { + Impl::TestBitset< bitset_type, true > f(bitset); + uint32_t count = f.testit(10u); + EXPECT_EQ( bitset.size(), bitset.count()); + EXPECT_EQ( bitset.size(), count ); + } + + //std::cout << " Check reset(i) " << std::endl; + // test resetting bits + { + Impl::TestBitset< bitset_type, false > f(bitset); + uint32_t count = f.testit(10u); + EXPECT_EQ( bitset.size(), count); + EXPECT_EQ( 0u, bitset.count() ); + } + + + //std::cout << " Check find_any_set(i) " << std::endl; + // test setting any bits + { + Impl::TestBitsetAny< bitset_type, true > f(bitset); + uint32_t count = f.testit(); + EXPECT_EQ( bitset.size(), bitset.count()); + EXPECT_EQ( bitset.size(), count ); + } + + //std::cout << " Check find_any_unset(i) " << std::endl; + // test resetting any bits + { + Impl::TestBitsetAny< bitset_type, false > f(bitset); + uint32_t count = f.testit(); + EXPECT_EQ( bitset.size(), count); + EXPECT_EQ( 0u, bitset.count() ); + } + + } + +} + +} // namespace Test + +#endif //KOKKOS_TEST_BITSET_HPP + diff --git a/lib/kokkos/containers/unit_tests/TestComplex.hpp b/lib/kokkos/containers/unit_tests/TestComplex.hpp new file mode 100755 index 0000000000000000000000000000000000000000..a2769fd1175d5e76b68c5a415fcce4d0573e6656 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestComplex.hpp @@ -0,0 +1,264 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + + +#ifndef KOKKOS_TEST_COMPLEX_HPP +#define KOKKOS_TEST_COMPLEX_HPP + +#include <Kokkos_Complex.hpp> +#include <gtest/gtest.h> +#include <iostream> + +namespace Test { + +namespace Impl { + template <typename RealType> + void testComplexConstructors () { + typedef Kokkos::complex<RealType> complex_type; + + complex_type z1; + complex_type z2 (0.0, 0.0); + complex_type z3 (1.0, 0.0); + complex_type z4 (0.0, 1.0); + complex_type z5 (-1.0, -2.0); + + ASSERT_TRUE( z1 == z2 ); + ASSERT_TRUE( z1 != z3 ); + ASSERT_TRUE( z1 != z4 ); + ASSERT_TRUE( z1 != z5 ); + + ASSERT_TRUE( z2 != z3 ); + ASSERT_TRUE( z2 != z4 ); + ASSERT_TRUE( z2 != z5 ); + + ASSERT_TRUE( z3 != z4 ); + ASSERT_TRUE( z3 != z5 ); + + complex_type z6 (-1.0, -2.0); + ASSERT_TRUE( z5 == z6 ); + + // Make sure that complex has value semantics, in particular, that + // equality tests use values and not pointers, so that + // reassignment actually changes the value. + z1 = complex_type (-3.0, -4.0); + ASSERT_TRUE( z1.real () == -3.0 ); + ASSERT_TRUE( z1.imag () == -4.0 ); + ASSERT_TRUE( z1 != z2 ); + + complex_type z7 (1.0); + ASSERT_TRUE( z3 == z7 ); + ASSERT_TRUE( z7 == 1.0 ); + ASSERT_TRUE( z7 != -1.0 ); + + z7 = complex_type (5.0); + ASSERT_TRUE( z7.real () == 5.0 ); + ASSERT_TRUE( z7.imag () == 0.0 ); + } + + template <typename RealType> + void testPlus () { + typedef Kokkos::complex<RealType> complex_type; + + complex_type z1 (1.0, -1.0); + complex_type z2 (-1.0, 1.0); + complex_type z3 = z1 + z2; + ASSERT_TRUE( z3 == complex_type (0.0, 0.0) ); + } + + template <typename RealType> + void testMinus () { + typedef Kokkos::complex<RealType> complex_type; + + // Test binary minus. + complex_type z1 (1.0, -1.0); + complex_type z2 (-1.0, 1.0); + complex_type z3 = z1 - z2; + ASSERT_TRUE( z3 == complex_type (2.0, -2.0) ); + + // Test unary minus. 
+ complex_type z4 (3.0, -4.0); + ASSERT_TRUE( -z1 == complex_type (-3.0, 4.0) ); + } + + template <typename RealType> + void testTimes () { + typedef Kokkos::complex<RealType> complex_type; + + complex_type z1 (1.0, -1.0); + complex_type z2 (-1.0, 1.0); + complex_type z3 = z1 - z2; + ASSERT_TRUE( z3 == complex_type (2.0, -2.0) ); + + // Test unary minus. + complex_type z4 (3.0, -4.0); + ASSERT_TRUE( z4 == complex_type (3.0, -4.0) ); + ASSERT_TRUE( -z4 == complex_type (-3.0, 4.0) ); + ASSERT_TRUE( z4 == -complex_type (-3.0, 4.0) ); + } + + template <typename RealType> + void testDivide () { + typedef Kokkos::complex<RealType> complex_type; + + // Test division of a complex number by a real number. + complex_type z1 (1.0, -1.0); + complex_type z2 (1.0 / 2.0, -1.0 / 2.0); + ASSERT_TRUE( z1 / 2.0 == z2 ); + + // (-1+2i)/(1-i) == ((-1+2i)(1+i)) / ((1-i)(1+i)) + // (-1+2i)(1+i) == -3 + i + complex_type z3 (-1.0, 2.0); + complex_type z4 (1.0, -1.0); + complex_type z5 (-3.0, 1.0); + ASSERT_TRUE(z3 * Kokkos::conj (z4) == z5 ); + + // Test division of a complex number by a complex number. + // This assumes that RealType is a floating-point type. + complex_type z6 (Kokkos::real (z5) / 2.0, + Kokkos::imag (z5) / 2.0); + + complex_type z7 = z3 / z4; + ASSERT_TRUE( z7 == z6 ); + } + + template <typename RealType> + void testOutsideKernel () { + testComplexConstructors<RealType> (); + testPlus<RealType> (); + testTimes<RealType> (); + testDivide<RealType> (); + } + + + template<typename RealType, typename Device> + void testCreateView () { + typedef Kokkos::complex<RealType> complex_type; + Kokkos::View<complex_type*, Device> x ("x", 10); + ASSERT_TRUE( x.dimension_0 () == 10 ); + + // Test that View assignment works. 
+ Kokkos::View<complex_type*, Device> x_nonconst = x; + Kokkos::View<const complex_type*, Device> x_const = x; + } + + template<typename RealType, typename Device> + class Fill { + public: + typedef typename Device::execution_space execution_space; + + typedef Kokkos::View<Kokkos::complex<RealType>*, Device> view_type; + typedef typename view_type::size_type size_type; + + KOKKOS_INLINE_FUNCTION + void operator () (const size_type i) const { + x_(i) = val_; + } + + Fill (const view_type& x, const Kokkos::complex<RealType>& val) : + x_ (x), val_ (val) + {} + + private: + view_type x_; + const Kokkos::complex<RealType> val_; + }; + + template<typename RealType, typename Device> + class Sum { + public: + typedef typename Device::execution_space execution_space; + + typedef Kokkos::View<const Kokkos::complex<RealType>*, Device> view_type; + typedef typename view_type::size_type size_type; + typedef Kokkos::complex<RealType> value_type; + + KOKKOS_INLINE_FUNCTION + void operator () (const size_type i, Kokkos::complex<RealType>& sum) const { + sum += x_(i); + } + + Sum (const view_type& x) : x_ (x) {} + + private: + view_type x_; + }; + + template<typename RealType, typename Device> + void testInsideKernel () { + typedef Kokkos::complex<RealType> complex_type; + typedef Kokkos::View<complex_type*, Device> view_type; + typedef typename view_type::size_type size_type; + + const size_type N = 1000; + view_type x ("x", N); + ASSERT_TRUE( x.dimension_0 () == N ); + + // Kokkos::parallel_reduce (N, [=] (const size_type i, complex_type& result) { + // result += x[i]; + // }); + + Kokkos::parallel_for (N, Fill<RealType, Device> (x, complex_type (1.0, -1.0))); + + complex_type sum; + Kokkos::parallel_reduce (N, Sum<RealType, Device> (x), sum); + + ASSERT_TRUE( sum.real () == 1000.0 && sum.imag () == -1000.0 ); + } +} // namespace Impl + + +template <typename Device> +void testComplex () +{ + Impl::testOutsideKernel<float> (); + Impl::testOutsideKernel<double> (); + + 
Impl::testCreateView<float, Device> (); + Impl::testCreateView<double, Device> (); + + Impl::testInsideKernel<float, Device> (); + Impl::testInsideKernel<double, Device> (); +} + + +} // namespace Test + +#endif // KOKKOS_TEST_COMPLEX_HPP diff --git a/lib/kokkos/containers/unit_tests/TestCuda.cpp b/lib/kokkos/containers/unit_tests/TestCuda.cpp new file mode 100755 index 0000000000000000000000000000000000000000..2f79205c491f22ec067b44a24f8bfc5323504e9e --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestCuda.cpp @@ -0,0 +1,206 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include <iomanip> +#include <stdint.h> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <Kokkos_Bitset.hpp> +#include <Kokkos_UnorderedMap.hpp> +#include <Kokkos_Vector.hpp> + +#include <TestBitset.hpp> +#include <TestUnorderedMap.hpp> +#include <TestStaticCrsGraph.hpp> +#include <TestVector.hpp> +#include <TestDualView.hpp> +#include <TestSegmentedView.hpp> + +//---------------------------------------------------------------------------- + + +#ifdef KOKKOS_HAVE_CUDA + +namespace Test { + +class cuda : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Cuda::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +TEST_F( cuda , staticcrsgraph ) +{ + TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >(); + TestStaticCrsGraph::run_test_graph2< Kokkos::Cuda >(); +} + + +void cuda_test_insert_close( uint32_t num_nodes + , uint32_t num_inserts + , uint32_t num_duplicates + ) +{ + test_insert< Kokkos::Cuda >( num_nodes, num_inserts, num_duplicates, true); +} + +void 
cuda_test_insert_far( uint32_t num_nodes + , uint32_t num_inserts + , uint32_t num_duplicates + ) +{ + test_insert< Kokkos::Cuda >( num_nodes, num_inserts, num_duplicates, false); +} + +void cuda_test_failed_insert( uint32_t num_nodes ) +{ + test_failed_insert< Kokkos::Cuda >( num_nodes ); +} + +void cuda_test_deep_copy( uint32_t num_nodes ) +{ + test_deep_copy< Kokkos::Cuda >( num_nodes ); +} + +void cuda_test_vector_combinations(unsigned int size) +{ + test_vector_combinations<int,Kokkos::Cuda>(size); +} + +void cuda_test_dualview_combinations(unsigned int size) +{ + test_dualview_combinations<int,Kokkos::Cuda>(size); +} + +void cuda_test_segmented_view(unsigned int size) +{ + test_segmented_view<double,Kokkos::Cuda>(size); +} + +void cuda_test_bitset() +{ + test_bitset<Kokkos::Cuda>(); +} + + + +/*TEST_F( cuda, bitset ) +{ + cuda_test_bitset(); +}*/ + +#define CUDA_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat ) \ + TEST_F( cuda, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + cuda_test_insert_##name(num_nodes,num_inserts,num_duplicates); \ + } + +#define CUDA_FAILED_INSERT_TEST( num_nodes, repeat ) \ + TEST_F( cuda, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + cuda_test_failed_insert(num_nodes); \ + } + +#define CUDA_ASSIGNEMENT_TEST( num_nodes, repeat ) \ + TEST_F( cuda, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + cuda_test_assignment_operators(num_nodes); \ + } + +#define CUDA_DEEP_COPY( num_nodes, repeat ) \ + TEST_F( cuda, UnorderedMap_deep_copy##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + cuda_test_deep_copy(num_nodes); \ + } + +#define CUDA_VECTOR_COMBINE_TEST( size ) \ + TEST_F( cuda, vector_combination##size##x) { \ + cuda_test_vector_combinations(size); \ + } + +#define CUDA_DUALVIEW_COMBINE_TEST( size ) \ + TEST_F( cuda, 
dualview_combination##size##x) { \ + cuda_test_dualview_combinations(size); \ + } + +#define CUDA_SEGMENTEDVIEW_TEST( size ) \ + TEST_F( cuda, segmentedview_##size##x) { \ + cuda_test_segmented_view(size); \ + } + +CUDA_DUALVIEW_COMBINE_TEST( 10 ) +CUDA_VECTOR_COMBINE_TEST( 10 ) +CUDA_VECTOR_COMBINE_TEST( 3057 ) + + +CUDA_INSERT_TEST(close, 100000, 90000, 100, 500) +CUDA_INSERT_TEST(far, 100000, 90000, 100, 500) +CUDA_DEEP_COPY( 10000, 1 ) +CUDA_FAILED_INSERT_TEST( 10000, 1000 ) +CUDA_SEGMENTEDVIEW_TEST( 200 ) + + +#undef CUDA_INSERT_TEST +#undef CUDA_FAILED_INSERT_TEST +#undef CUDA_ASSIGNEMENT_TEST +#undef CUDA_DEEP_COPY +#undef CUDA_VECTOR_COMBINE_TEST +#undef CUDA_DUALVIEW_COMBINE_TEST +#undef CUDA_SEGMENTEDVIEW_TEST +} + +#endif /* #ifdef KOKKOS_HAVE_CUDA */ + diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp new file mode 100755 index 0000000000000000000000000000000000000000..e72c69f7d41cf7d493becfcbb863e5f1d9f6679f --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -0,0 +1,121 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_DUALVIEW_HPP +#define KOKKOS_TEST_DUALVIEW_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <cstdlib> +#include <cstdio> +#include <impl/Kokkos_Timer.hpp> + +namespace Test { + +namespace Impl { + + template <typename Scalar, class Device> + struct test_dualview_combinations + { + typedef test_dualview_combinations<Scalar,Device> self_type; + + typedef Scalar scalar_type; + typedef Device execution_space; + + Scalar reference; + Scalar result; + + template <typename ViewType> + Scalar run_me(unsigned int n,unsigned int m){ + if(n<10) n = 10; + if(m<3) m = 3; + ViewType a("A",n,m); + + Kokkos::deep_copy( a.d_view , 1 ); + + a.template modify<typename ViewType::execution_space>(); + a.template sync<typename ViewType::host_mirror_space>(); + + a.h_view(5,1) = 3; + a.h_view(6,1) = 4; + a.h_view(7,2) = 5; + a.template modify<typename ViewType::host_mirror_space>(); + ViewType b = Kokkos::subview(a,std::pair<unsigned int, unsigned int>(6,9),std::pair<unsigned int, unsigned int>(0,1)); + a.template sync<typename ViewType::execution_space>(); + b.template modify<typename ViewType::execution_space>(); + + Kokkos::deep_copy( b.d_view , 2 ); + + a.template sync<typename ViewType::host_mirror_space>(); + Scalar count = 0; + for(unsigned int i = 0; i<a.d_view.dimension_0(); i++) + for(unsigned int j = 0; j<a.d_view.dimension_1(); j++) + count += a.h_view(i,j); + return count - a.d_view.dimension_0()*a.d_view.dimension_1()-2-4-3*2; + } + + + test_dualview_combinations(unsigned int size) + { + result = run_me< Kokkos::DualView<Scalar**,Kokkos::LayoutLeft,Device> >(size,3); + } + + }; + +} // namespace Impl + + + + +template <typename Scalar, typename Device> +void test_dualview_combinations(unsigned int size) +{ + Impl::test_dualview_combinations<Scalar,Device> test(size); + ASSERT_EQ( test.result,0); + +} + + +} 
// namespace Test + +#endif //KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp new file mode 100755 index 0000000000000000000000000000000000000000..0ff9b4f66b640b5b2bffa98a050e8bb6df33aaa3 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp @@ -0,0 +1,162 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <Kokkos_Bitset.hpp> +#include <Kokkos_UnorderedMap.hpp> +#include <Kokkos_Vector.hpp> + +//---------------------------------------------------------------------------- +#include <TestBitset.hpp> +#include <TestUnorderedMap.hpp> +#include <TestStaticCrsGraph.hpp> +#include <TestVector.hpp> +#include <TestDualView.hpp> +#include <TestSegmentedView.hpp> +#include <TestComplex.hpp> + +#include <iomanip> + +namespace Test { + +#ifdef KOKKOS_HAVE_OPENMP +class openmp : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + + unsigned threads_count = 4 ; + + if ( Kokkos::hwloc::available() ) { + threads_count = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa(); + } + + Kokkos::OpenMP::initialize( threads_count ); + } + + static void TearDownTestCase() + { + Kokkos::OpenMP::finalize(); + } +}; + +TEST_F( openmp, complex ) +{ + testComplex<Kokkos::OpenMP> (); +} + +TEST_F( openmp, bitset ) +{ + test_bitset<Kokkos::OpenMP>(); +} + +TEST_F( openmp , staticcrsgraph ) +{ + TestStaticCrsGraph::run_test_graph< Kokkos::OpenMP >(); + TestStaticCrsGraph::run_test_graph2< Kokkos::OpenMP >(); +} + +#define 
OPENMP_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near ) \ + TEST_F( openmp, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_insert<Kokkos::OpenMP>(num_nodes,num_inserts,num_duplicates, near); \ + } + +#define OPENMP_FAILED_INSERT_TEST( num_nodes, repeat ) \ + TEST_F( openmp, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_failed_insert<Kokkos::OpenMP>(num_nodes); \ + } + +#define OPENMP_ASSIGNEMENT_TEST( num_nodes, repeat ) \ + TEST_F( openmp, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_assignement_operators<Kokkos::OpenMP>(num_nodes); \ + } + +#define OPENMP_DEEP_COPY( num_nodes, repeat ) \ + TEST_F( openmp, UnorderedMap_deep_copy##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_deep_copy<Kokkos::OpenMP>(num_nodes); \ + } + +#define OPENMP_VECTOR_COMBINE_TEST( size ) \ + TEST_F( openmp, vector_combination##size##x) { \ + test_vector_combinations<int,Kokkos::OpenMP>(size); \ + } + +#define OPENMP_DUALVIEW_COMBINE_TEST( size ) \ + TEST_F( openmp, dualview_combination##size##x) { \ + test_dualview_combinations<int,Kokkos::OpenMP>(size); \ + } + +#define OPENMP_SEGMENTEDVIEW_TEST( size ) \ + TEST_F( openmp, segmentedview_##size##x) { \ + test_segmented_view<double,Kokkos::OpenMP>(size); \ + } + +OPENMP_INSERT_TEST(close, 100000, 90000, 100, 500, true) +OPENMP_INSERT_TEST(far, 100000, 90000, 100, 500, false) +OPENMP_FAILED_INSERT_TEST( 10000, 1000 ) +OPENMP_DEEP_COPY( 10000, 1 ) + +OPENMP_VECTOR_COMBINE_TEST( 10 ) +OPENMP_VECTOR_COMBINE_TEST( 3057 ) +OPENMP_DUALVIEW_COMBINE_TEST( 10 ) +OPENMP_SEGMENTEDVIEW_TEST( 10000 ) + +#undef OPENMP_INSERT_TEST +#undef OPENMP_FAILED_INSERT_TEST +#undef OPENMP_ASSIGNEMENT_TEST +#undef OPENMP_DEEP_COPY +#undef OPENMP_VECTOR_COMBINE_TEST +#undef OPENMP_DUALVIEW_COMBINE_TEST +#undef 
OPENMP_SEGMENTEDVIEW_TEST +#endif +} // namespace test + diff --git a/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp b/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp new file mode 100755 index 0000000000000000000000000000000000000000..3da4bc781bd31c23bf4b9283f343670c37d820d2 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp @@ -0,0 +1,708 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP +#define KOKKOS_TEST_SEGMENTEDVIEW_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <cstdlib> +#include <cstdio> +#include <Kokkos_Core.hpp> + +#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + +#include <Kokkos_SegmentedView.hpp> +#include <impl/Kokkos_Timer.hpp> + +namespace Test { + +namespace Impl { + + template<class ViewType , class ExecutionSpace, int Rank = ViewType::Rank> + struct GrowTest; + + template<class ViewType , class ExecutionSpace> + struct GrowTest<ViewType , ExecutionSpace , 1> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + GrowTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + a.grow(team_member , team_idx+team_member.team_size()); + value += team_idx + team_member.team_rank(); + + if((a.dimension_0()>team_idx+team_member.team_rank()) && + (a.dimension(0)>team_idx+team_member.team_rank())) + a(team_idx+team_member.team_rank()) = team_idx+team_member.team_rank(); + + } + }; + + template<class ViewType 
, class ExecutionSpace> + struct GrowTest<ViewType , ExecutionSpace , 2> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + GrowTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + a.grow(team_member , team_idx+ team_member.team_size()); + + for( typename ExecutionSpace::size_type k=0;k<7;k++) + value += team_idx + team_member.team_rank() + 13*k; + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) { + a(team_idx+ team_member.team_rank(),k) = + team_idx+ team_member.team_rank() + 13*k; + } + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct GrowTest<ViewType , ExecutionSpace , 3> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + GrowTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + a.grow(team_member , team_idx+ team_member.team_size()); + + for( typename ExecutionSpace::size_type k=0;k<7;k++) + for( typename ExecutionSpace::size_type l=0;l<3;l++) + value += team_idx + team_member.team_rank() + 13*k + 3*l; + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + a(team_idx+ team_member.team_rank(),k,l) = + team_idx+ 
team_member.team_rank() + 13*k + 3*l; + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct GrowTest<ViewType , ExecutionSpace , 4> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + GrowTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + a.grow(team_member , team_idx+ team_member.team_size()); + + for( typename ExecutionSpace::size_type k=0;k<7;k++) + for( typename ExecutionSpace::size_type l=0;l<3;l++) + for( typename ExecutionSpace::size_type m=0;m<2;m++) + value += team_idx + team_member.team_rank() + 13*k + 3*l + 7*m; + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++) + a(team_idx+ team_member.team_rank(),k,l,m) = + team_idx+ team_member.team_rank() + 13*k + 3*l + 7*m; + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct GrowTest<ViewType , ExecutionSpace , 5> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + GrowTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + a.grow(team_member , team_idx+ team_member.team_size()); + + for( typename ExecutionSpace::size_type k=0;k<7;k++) + for( typename ExecutionSpace::size_type l=0;l<3;l++) + for( typename 
ExecutionSpace::size_type m=0;m<2;m++) + for( typename ExecutionSpace::size_type n=0;n<3;n++) + value += + team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n; + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++) + for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++) + a(team_idx+ team_member.team_rank(),k,l,m,n) = + team_idx+ team_member.team_rank() + 13*k + 3*l + 7*m + 5*n; + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct GrowTest<ViewType , ExecutionSpace , 6> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + GrowTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + a.grow(team_member , team_idx+ team_member.team_size()); + + for( typename ExecutionSpace::size_type k=0;k<7;k++) + for( typename ExecutionSpace::size_type l=0;l<3;l++) + for( typename ExecutionSpace::size_type m=0;m<2;m++) + for( typename ExecutionSpace::size_type n=0;n<3;n++) + for( typename ExecutionSpace::size_type o=0;o<2;o++) + value += + team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ; + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++) + for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++) 
+ for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++) + a(team_idx+ team_member.team_rank(),k,l,m,n,o) = + team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ; + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct GrowTest<ViewType , ExecutionSpace , 7> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + GrowTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + a.grow(team_member , team_idx+ team_member.team_size()); + + for( typename ExecutionSpace::size_type k=0;k<7;k++) + for( typename ExecutionSpace::size_type l=0;l<3;l++) + for( typename ExecutionSpace::size_type m=0;m<2;m++) + for( typename ExecutionSpace::size_type n=0;n<3;n++) + for( typename ExecutionSpace::size_type o=0;o<2;o++) + for( typename ExecutionSpace::size_type p=0;p<4;p++) + value += + team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ; + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++) + for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++) + for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++) + for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++) + a(team_idx+ team_member.team_rank(),k,l,m,n,o,p) = + team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ; + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct GrowTest<ViewType , ExecutionSpace , 8> { + typedef ExecutionSpace 
execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + GrowTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + a.grow(team_member , team_idx + team_member.team_size()); + + for( typename ExecutionSpace::size_type k=0;k<7;k++) + for( typename ExecutionSpace::size_type l=0;l<3;l++) + for( typename ExecutionSpace::size_type m=0;m<2;m++) + for( typename ExecutionSpace::size_type n=0;n<3;n++) + for( typename ExecutionSpace::size_type o=0;o<2;o++) + for( typename ExecutionSpace::size_type p=0;p<4;p++) + for( typename ExecutionSpace::size_type q=0;q<3;q++) + value += + team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q; + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++) + for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++) + for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++) + for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++) + for( typename ExecutionSpace::size_type q=0;q<a.dimension_7();q++) + a(team_idx+ team_member.team_rank(),k,l,m,n,o,p,q) = + team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q; + } + } + }; + + template<class ViewType , class ExecutionSpace, int Rank = ViewType::Rank> + struct VerifyTest; + + template<class ViewType , class ExecutionSpace> + struct VerifyTest<ViewType , ExecutionSpace , 1> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename 
Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + VerifyTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + value += a(team_idx+ team_member.team_rank()); + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct VerifyTest<ViewType , ExecutionSpace , 2> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + VerifyTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + value += a(team_idx+ team_member.team_rank(),k); + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct VerifyTest<ViewType , ExecutionSpace , 3> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + VerifyTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + value += 
a(team_idx+ team_member.team_rank(),k,l); + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct VerifyTest<ViewType , ExecutionSpace , 4> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + VerifyTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++) + value += a(team_idx+ team_member.team_rank(),k,l,m); + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct VerifyTest<ViewType , ExecutionSpace , 5> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + VerifyTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++) + for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++) + value += a(team_idx+ team_member.team_rank(),k,l,m,n); + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct 
VerifyTest<ViewType , ExecutionSpace , 6> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + VerifyTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++) + for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++) + for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++) + value += a(team_idx+ team_member.team_rank(),k,l,m,n,o); + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct VerifyTest<ViewType , ExecutionSpace , 7> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + VerifyTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++) + for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++) + for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++) + for( typename ExecutionSpace::size_type 
p=0;p<a.dimension_6();p++) + value += a(team_idx+ team_member.team_rank(),k,l,m,n,o,p); + } + } + }; + + template<class ViewType , class ExecutionSpace> + struct VerifyTest<ViewType , ExecutionSpace , 8> { + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + typedef typename Policy::member_type team_type; + typedef double value_type; + + ViewType a; + + VerifyTest(ViewType in):a(in) {} + + KOKKOS_INLINE_FUNCTION + void operator() (team_type team_member, double& value) const { + unsigned int team_idx = team_member.league_rank() * team_member.team_size(); + + if((a.dimension_0()>team_idx+ team_member.team_rank()) && + (a.dimension(0)>team_idx+ team_member.team_rank())) { + for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) + for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++) + for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++) + for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++) + for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++) + for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++) + for( typename ExecutionSpace::size_type q=0;q<a.dimension_7();q++) + value += a(team_idx+ team_member.team_rank(),k,l,m,n,o,p,q); + } + } + }; + + template <typename Scalar, class ExecutionSpace> + struct test_segmented_view + { + typedef test_segmented_view<Scalar,ExecutionSpace> self_type; + + typedef Scalar scalar_type; + typedef ExecutionSpace execution_space; + typedef Kokkos::TeamPolicy<execution_space> Policy; + + double result; + double reference; + + template <class ViewType> + void run_me(ViewType a, int max_length){ + const int team_size = Policy::team_size_max( GrowTest<ViewType,execution_space>(a) ); + const int nteams = max_length/team_size; + + reference = 0; + result = 0; + + Kokkos::parallel_reduce(Policy(nteams,team_size),GrowTest<ViewType,execution_space>(a),reference); + Kokkos::fence(); + 
Kokkos::parallel_reduce(Policy(nteams,team_size),VerifyTest<ViewType,execution_space>(a),result); + Kokkos::fence(); + } + + + test_segmented_view(unsigned int size,int rank) + { + reference = 0; + result = 0; + + const int dim_1 = 7; + const int dim_2 = 3; + const int dim_3 = 2; + const int dim_4 = 3; + const int dim_5 = 2; + const int dim_6 = 4; + //const int dim_7 = 3; + + if(rank==1) { + typedef Kokkos::Experimental::SegmentedView<Scalar*,Kokkos::LayoutLeft,ExecutionSpace> rank1_view; + run_me< rank1_view >(rank1_view("Rank1",128,size), size); + } + if(rank==2) { + typedef Kokkos::Experimental::SegmentedView<Scalar**,Kokkos::LayoutLeft,ExecutionSpace> rank2_view; + run_me< rank2_view >(rank2_view("Rank2",128,size,dim_1), size); + } + if(rank==3) { + typedef Kokkos::Experimental::SegmentedView<Scalar*[7][3][2],Kokkos::LayoutRight,ExecutionSpace> rank3_view; + run_me< rank3_view >(rank3_view("Rank3",128,size), size); + } + if(rank==4) { + typedef Kokkos::Experimental::SegmentedView<Scalar****,Kokkos::LayoutRight,ExecutionSpace> rank4_view; + run_me< rank4_view >(rank4_view("Rank4",128,size,dim_1,dim_2,dim_3), size); + } + if(rank==5) { + typedef Kokkos::Experimental::SegmentedView<Scalar*[7][3][2][3],Kokkos::LayoutLeft,ExecutionSpace> rank5_view; + run_me< rank5_view >(rank5_view("Rank5",128,size), size); + } + if(rank==6) { + typedef Kokkos::Experimental::SegmentedView<Scalar*****[2],Kokkos::LayoutRight,ExecutionSpace> rank6_view; + run_me< rank6_view >(rank6_view("Rank6",128,size,dim_1,dim_2,dim_3,dim_4), size); + } + if(rank==7) { + typedef Kokkos::Experimental::SegmentedView<Scalar*******,Kokkos::LayoutLeft,ExecutionSpace> rank7_view; + run_me< rank7_view >(rank7_view("Rank7",128,size,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6), size); + } + if(rank==8) { + typedef Kokkos::Experimental::SegmentedView<Scalar*****[2][4][3],Kokkos::LayoutLeft,ExecutionSpace> rank8_view; + run_me< rank8_view >(rank8_view("Rank8",128,size,dim_1,dim_2,dim_3,dim_4), size); + } + } + + }; + 
+} // namespace Impl + + + + +template <typename Scalar, class ExecutionSpace> +void test_segmented_view(unsigned int size) +{ + { + typedef Kokkos::Experimental::SegmentedView<Scalar*****[2][4][3],Kokkos::LayoutLeft,ExecutionSpace> view_type; + view_type a("A",128,size,7,3,2,3); + double reference; + + Impl::GrowTest<view_type,ExecutionSpace> f(a); + + const int team_size = Kokkos::TeamPolicy<ExecutionSpace>::team_size_max( f ); + const int nteams = (size+team_size-1)/team_size; + + Kokkos::parallel_reduce(Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),f,reference); + + size_t real_size = ((size+127)/128)*128; + + ASSERT_EQ(real_size,a.dimension_0()); + ASSERT_EQ(7,a.dimension_1()); + ASSERT_EQ(3,a.dimension_2()); + ASSERT_EQ(2,a.dimension_3()); + ASSERT_EQ(3,a.dimension_4()); + ASSERT_EQ(2,a.dimension_5()); + ASSERT_EQ(4,a.dimension_6()); + ASSERT_EQ(3,a.dimension_7()); + ASSERT_EQ(real_size,a.dimension(0)); + ASSERT_EQ(7,a.dimension(1)); + ASSERT_EQ(3,a.dimension(2)); + ASSERT_EQ(2,a.dimension(3)); + ASSERT_EQ(3,a.dimension(4)); + ASSERT_EQ(2,a.dimension(5)); + ASSERT_EQ(4,a.dimension(6)); + ASSERT_EQ(3,a.dimension(7)); + ASSERT_EQ(8,a.Rank); + } + { + Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,1); + ASSERT_EQ(test.reference,test.result); + } + { + Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,2); + ASSERT_EQ(test.reference,test.result); + } + { + Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,3); + ASSERT_EQ(test.reference,test.result); + } + { + Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,4); + ASSERT_EQ(test.reference,test.result); + } + { + Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,5); + ASSERT_EQ(test.reference,test.result); + } + { + Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,6); + ASSERT_EQ(test.reference,test.result); + } + { + Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,7); + ASSERT_EQ(test.reference,test.result); + } + { + 
Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,8); + ASSERT_EQ(test.reference,test.result); + } + +} + + +} // namespace Test + +#else + +template <typename Scalar, class ExecutionSpace> +void test_segmented_view(unsigned int ) {} + +#endif + +#endif /* #ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP */ + diff --git a/lib/kokkos/containers/unit_tests/TestSerial.cpp b/lib/kokkos/containers/unit_tests/TestSerial.cpp new file mode 100755 index 0000000000000000000000000000000000000000..6f00b113f96210299ddbd378d8bcaabed43d842d --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestSerial.cpp @@ -0,0 +1,158 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if ! defined(KOKKOS_HAVE_SERIAL) +# error "It doesn't make sense to build this file unless the Kokkos::Serial device is enabled. If you see this message, it probably means that there is an error in Kokkos' CMake build infrastructure." 
+#else + +#include <Kokkos_Bitset.hpp> +#include <Kokkos_UnorderedMap.hpp> +#include <Kokkos_Vector.hpp> + +#include <TestBitset.hpp> +#include <TestUnorderedMap.hpp> +#include <TestStaticCrsGraph.hpp> +#include <TestVector.hpp> +#include <TestDualView.hpp> +#include <TestSegmentedView.hpp> +#include <TestComplex.hpp> + +#include <iomanip> + +namespace Test { + +class serial : public ::testing::Test { +protected: + static void SetUpTestCase () { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::Serial::initialize (); + } + + static void TearDownTestCase () { + Kokkos::Serial::finalize (); + } +}; + + +TEST_F( serial , staticcrsgraph ) +{ + TestStaticCrsGraph::run_test_graph< Kokkos::Serial >(); + TestStaticCrsGraph::run_test_graph2< Kokkos::Serial >(); +} + +TEST_F( serial, complex ) +{ + testComplex<Kokkos::Serial> (); +} + +TEST_F( serial, bitset ) +{ + test_bitset<Kokkos::Serial> (); +} + +#define SERIAL_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near ) \ + TEST_F( serial, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_insert<Kokkos::Serial> (num_nodes, num_inserts, num_duplicates, near); \ + } + +#define SERIAL_FAILED_INSERT_TEST( num_nodes, repeat ) \ + TEST_F( serial, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_failed_insert<Kokkos::Serial> (num_nodes); \ + } + +#define SERIAL_ASSIGNEMENT_TEST( num_nodes, repeat ) \ + TEST_F( serial, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_assignement_operators<Kokkos::Serial> (num_nodes); \ + } + +#define SERIAL_DEEP_COPY( num_nodes, repeat ) \ + TEST_F( serial, UnorderedMap_deep_copy##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_deep_copy<Kokkos::Serial> (num_nodes); \ + } + +#define SERIAL_VECTOR_COMBINE_TEST( size ) \ + TEST_F( serial, 
vector_combination##size##x) { \ + test_vector_combinations<int,Kokkos::Serial>(size); \ + } + +#define SERIAL_DUALVIEW_COMBINE_TEST( size ) \ + TEST_F( serial, dualview_combination##size##x) { \ + test_dualview_combinations<int,Kokkos::Serial>(size); \ + } + +#define SERIAL_SEGMENTEDVIEW_TEST( size ) \ + TEST_F( serial, segmentedview_##size##x) { \ + test_segmented_view<double,Kokkos::Serial>(size); \ + } + +SERIAL_INSERT_TEST(close, 100000, 90000, 100, 500, true) +SERIAL_INSERT_TEST(far, 100000, 90000, 100, 500, false) +SERIAL_FAILED_INSERT_TEST( 10000, 1000 ) +SERIAL_DEEP_COPY( 10000, 1 ) + +SERIAL_VECTOR_COMBINE_TEST( 10 ) +SERIAL_VECTOR_COMBINE_TEST( 3057 ) +SERIAL_DUALVIEW_COMBINE_TEST( 10 ) +SERIAL_SEGMENTEDVIEW_TEST( 10000 ) + +#undef SERIAL_INSERT_TEST +#undef SERIAL_FAILED_INSERT_TEST +#undef SERIAL_ASSIGNEMENT_TEST +#undef SERIAL_DEEP_COPY +#undef SERIAL_VECTOR_COMBINE_TEST +#undef SERIAL_DUALVIEW_COMBINE_TEST +#undef SERIAL_SEGMENTEDVIEW_TEST + +} // namespace test + +#endif // KOKKOS_HAVE_SERIAL + + diff --git a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp new file mode 100755 index 0000000000000000000000000000000000000000..52b45b786562efcfbaf10a4db3ac280eb644b09b --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp @@ -0,0 +1,149 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <vector> + +#include <Kokkos_StaticCrsGraph.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace TestStaticCrsGraph { + +template< class Space > +void run_test_graph() +{ + typedef Kokkos::StaticCrsGraph< unsigned , Space > dView ; + typedef typename dView::HostMirror hView ; + + const unsigned LENGTH = 1000 ; + dView dx ; + hView hx ; + + std::vector< std::vector< int > > graph( LENGTH ); + + for ( size_t i = 0 ; i < LENGTH ; ++i ) { + graph[i].reserve(8); + for ( size_t j = 0 ; j < 8 ; ++j ) { + graph[i].push_back( i + j * 3 ); + } + } + + dx = Kokkos::create_staticcrsgraph<dView>( "dx" , graph ); + hx = Kokkos::create_mirror( dx ); + + ASSERT_EQ( hx.row_map.dimension_0() - 1 , LENGTH ); + + for ( size_t i = 0 ; i < LENGTH ; ++i ) { + const size_t begin = hx.row_map[i]; + const size_t n = hx.row_map[i+1] - begin ; + ASSERT_EQ( n , graph[i].size() ); + for ( size_t j = 0 ; j < n ; ++j ) { + ASSERT_EQ( (int) hx.entries( j + begin ) , graph[i][j] ); + } + } +} + +template< class Space > +void run_test_graph2() +{ + typedef Kokkos::StaticCrsGraph< unsigned[3] , Space > dView ; + typedef typename dView::HostMirror hView ; + + const unsigned LENGTH = 10 ; + + std::vector< size_t > sizes( LENGTH ); + + size_t total_length = 0 ; + + for ( size_t i = 0 ; i < LENGTH ; ++i ) { + total_length += ( sizes[i] = 6 + i % 4 ); + } + + dView dx = Kokkos::create_staticcrsgraph<dView>( "test" , sizes ); + hView hx = Kokkos::create_mirror( dx ); + hView mx = Kokkos::create_mirror( dx ); + + ASSERT_EQ( (size_t) dx.row_map.dimension_0() , (size_t) LENGTH + 1 ); + ASSERT_EQ( (size_t) hx.row_map.dimension_0() , (size_t) LENGTH + 1 ); + ASSERT_EQ( (size_t) mx.row_map.dimension_0() , (size_t) LENGTH + 1 ); + + ASSERT_EQ( (size_t) dx.entries.dimension_0() , (size_t) 
total_length ); + ASSERT_EQ( (size_t) hx.entries.dimension_0() , (size_t) total_length ); + ASSERT_EQ( (size_t) mx.entries.dimension_0() , (size_t) total_length ); + + ASSERT_EQ( (size_t) dx.entries.dimension_1() , (size_t) 3 ); + ASSERT_EQ( (size_t) hx.entries.dimension_1() , (size_t) 3 ); + ASSERT_EQ( (size_t) mx.entries.dimension_1() , (size_t) 3 ); + + for ( size_t i = 0 ; i < LENGTH ; ++i ) { + const size_t entry_begin = hx.row_map[i]; + const size_t entry_end = hx.row_map[i+1]; + for ( size_t j = entry_begin ; j < entry_end ; ++j ) { + hx.entries(j,0) = j + 1 ; + hx.entries(j,1) = j + 2 ; + hx.entries(j,2) = j + 3 ; + } + } + + Kokkos::deep_copy( dx.entries , hx.entries ); + Kokkos::deep_copy( mx.entries , dx.entries ); + + ASSERT_EQ( mx.row_map.dimension_0() , (size_t) LENGTH + 1 ); + + for ( size_t i = 0 ; i < LENGTH ; ++i ) { + const size_t entry_begin = mx.row_map[i]; + const size_t entry_end = mx.row_map[i+1]; + ASSERT_EQ( ( entry_end - entry_begin ) , sizes[i] ); + for ( size_t j = entry_begin ; j < entry_end ; ++j ) { + ASSERT_EQ( (size_t) mx.entries( j , 0 ) , ( j + 1 ) ); + ASSERT_EQ( (size_t) mx.entries( j , 1 ) , ( j + 2 ) ); + ASSERT_EQ( (size_t) mx.entries( j , 2 ) , ( j + 3 ) ); + } + } +} + +} /* namespace TestStaticCrsGraph */ + + diff --git a/lib/kokkos/containers/unit_tests/TestThreads.cpp b/lib/kokkos/containers/unit_tests/TestThreads.cpp new file mode 100755 index 0000000000000000000000000000000000000000..9320a114fb858e94c8b7f60c60c322857147530f --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestThreads.cpp @@ -0,0 +1,168 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if defined( KOKKOS_HAVE_PTHREAD ) + +#include <Kokkos_Bitset.hpp> +#include <Kokkos_UnorderedMap.hpp> + +#include <Kokkos_Vector.hpp> +#include <iomanip> + + +//---------------------------------------------------------------------------- +#include <TestBitset.hpp> +#include <TestUnorderedMap.hpp> +#include <TestStaticCrsGraph.hpp> + +#include <TestVector.hpp> +#include <TestDualView.hpp> +#include <TestSegmentedView.hpp> + +namespace Test { + +class threads : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + + unsigned num_threads = 4; + + if (Kokkos::hwloc::available()) { + num_threads = Kokkos::hwloc::get_available_numa_count() + * Kokkos::hwloc::get_available_cores_per_numa() + // * Kokkos::hwloc::get_available_threads_per_core() + ; + + } + + std::cout << "Threads: " << num_threads << std::endl; + + Kokkos::Threads::initialize( num_threads ); + } + + static void TearDownTestCase() + { + Kokkos::Threads::finalize(); + } +}; + +TEST_F( threads , staticcrsgraph ) +{ + TestStaticCrsGraph::run_test_graph< Kokkos::Threads >(); + TestStaticCrsGraph::run_test_graph2< Kokkos::Threads >(); +} + +/*TEST_F( threads, bitset ) +{ + test_bitset<Kokkos::Threads>(); +}*/ + +#define THREADS_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near ) \ + TEST_F( threads, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_insert<Kokkos::Threads>(num_nodes,num_inserts,num_duplicates, near); \ + } + +#define THREADS_FAILED_INSERT_TEST( num_nodes, repeat ) \ + TEST_F( threads, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_failed_insert<Kokkos::Threads>(num_nodes); \ + 
} + +#define THREADS_ASSIGNEMENT_TEST( num_nodes, repeat ) \ + TEST_F( threads, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_assignement_operators<Kokkos::Threads>(num_nodes); \ + } + +#define THREADS_DEEP_COPY( num_nodes, repeat ) \ + TEST_F( threads, UnorderedMap_deep_copy##num_nodes##_##repeat##x) { \ + for (int i=0; i<repeat; ++i) \ + test_deep_copy<Kokkos::Threads>(num_nodes); \ + } + +#define THREADS_VECTOR_COMBINE_TEST( size ) \ + TEST_F( threads, vector_combination##size##x) { \ + test_vector_combinations<int,Kokkos::Threads>(size); \ + } + +#define THREADS_DUALVIEW_COMBINE_TEST( size ) \ + TEST_F( threads, dualview_combination##size##x) { \ + test_dualview_combinations<int,Kokkos::Threads>(size); \ + } + +#define THREADS_SEGMENTEDVIEW_TEST( size ) \ + TEST_F( threads, segmentedview_##size##x) { \ + test_segmented_view<double,Kokkos::Threads>(size); \ + } + + +THREADS_INSERT_TEST(far, 100000, 90000, 100, 500, false) +THREADS_FAILED_INSERT_TEST( 10000, 1000 ) +THREADS_DEEP_COPY( 10000, 1 ) + +THREADS_VECTOR_COMBINE_TEST( 10 ) +THREADS_VECTOR_COMBINE_TEST( 3057 ) +THREADS_DUALVIEW_COMBINE_TEST( 10 ) +THREADS_SEGMENTEDVIEW_TEST( 10000 ) + + +#undef THREADS_INSERT_TEST +#undef THREADS_FAILED_INSERT_TEST +#undef THREADS_ASSIGNEMENT_TEST +#undef THREADS_DEEP_COPY +#undef THREADS_VECTOR_COMBINE_TEST +#undef THREADS_DUALVIEW_COMBINE_TEST +#undef THREADS_SEGMENTEDVIEW_TEST + +} // namespace Test + + +#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */ + diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp new file mode 100755 index 0000000000000000000000000000000000000000..ff0328548dee0a3458faa82ab44a16e5a081d29b --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -0,0 +1,313 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_UNORDERED_MAP_HPP +#define KOKKOS_TEST_UNORDERED_MAP_HPP + +#include <gtest/gtest.h> +#include <iostream> + + +namespace Test { + +namespace Impl { + +template <typename MapType, bool Near = false> +struct TestInsert +{ + typedef MapType map_type; + typedef typename map_type::execution_space execution_space; + typedef uint32_t value_type; + + map_type map; + uint32_t inserts; + uint32_t collisions; + + TestInsert( map_type arg_map, uint32_t arg_inserts, uint32_t arg_collisions) + : map(arg_map) + , inserts(arg_inserts) + , collisions(arg_collisions) + {} + + void testit( bool rehash_on_fail = true ) + { + execution_space::fence(); + + uint32_t failed_count = 0; + do { + failed_count = 0; + Kokkos::parallel_reduce(inserts, *this, failed_count); + + if (rehash_on_fail && failed_count > 0u) { + const uint32_t new_capacity = map.capacity() + ((map.capacity()*3ull)/20u) + failed_count/collisions ; + map.rehash( new_capacity ); + } + } while (rehash_on_fail && failed_count > 0u); + + execution_space::fence(); + } + + + KOKKOS_INLINE_FUNCTION + void init( value_type & failed_count ) const { failed_count = 0; } + + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & failed_count, const volatile value_type & count ) const + { failed_count += count; } + + KOKKOS_INLINE_FUNCTION + void operator()(uint32_t i, value_type & failed_count) const + { + const uint32_t key = Near ? 
i/collisions : i%(inserts/collisions); + if (map.insert(key,i).failed()) ++failed_count; + } + +}; + + template <typename MapType, bool Near> + struct TestErase + { + typedef TestErase<MapType, Near> self_type; + + typedef MapType map_type; + typedef typename MapType::execution_space execution_space; + + map_type m_map; + uint32_t m_num_erase; + uint32_t m_num_duplicates; + + TestErase(map_type map, uint32_t num_erases, uint32_t num_duplicates) + : m_map(map) + , m_num_erase(num_erases) + , m_num_duplicates(num_duplicates) + {} + + void testit() + { + execution_space::fence(); + Kokkos::parallel_for(m_num_erase, *this); + execution_space::fence(); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename execution_space::size_type i) const + { + if (Near) { + m_map.erase(i/m_num_duplicates); + } + else { + m_map.erase(i%(m_num_erase/m_num_duplicates)); + } + + } + }; + + template <typename MapType> + struct TestFind + { + typedef MapType map_type; + typedef typename MapType::execution_space::execution_space execution_space; + typedef uint32_t value_type; + + map_type m_map; + uint32_t m_num_insert; + uint32_t m_num_duplicates; + uint32_t m_max_key; + + TestFind(map_type map, uint32_t num_inserts, uint32_t num_duplicates) + : m_map(map) + , m_num_insert(num_inserts) + , m_num_duplicates(num_duplicates) + , m_max_key( ((num_inserts + num_duplicates) - 1)/num_duplicates ) + {} + + void testit(value_type &errors) + { + execution_space::execution_space::fence(); + Kokkos::parallel_reduce(m_map.capacity(), *this, errors); + execution_space::execution_space::fence(); + } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & dst) + { + dst = 0; + } + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & dst, const volatile value_type & src) + { dst += src; } + + KOKKOS_INLINE_FUNCTION + void operator()(typename execution_space::size_type i, value_type & errors) const + { + const bool expect_to_find_i = (i < m_max_key); + + const bool exists = 
m_map.exists(i); + + if (expect_to_find_i && !exists) ++errors; + if (!expect_to_find_i && exists) ++errors; + } + }; + +} // namespace Impl + + + +template <typename Device> +void test_insert( uint32_t num_nodes , uint32_t num_inserts , uint32_t num_duplicates , bool near ) +{ + typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type; + typedef Kokkos::UnorderedMap<const uint32_t,const uint32_t, Device> const_map_type; + + const uint32_t expected_inserts = (num_inserts + num_duplicates -1u) / num_duplicates; + + map_type map; + map.rehash(num_nodes,false); + + if (near) { + Impl::TestInsert<map_type,true> test_insert(map, num_inserts, num_duplicates); + test_insert.testit(); + } else + { + Impl::TestInsert<map_type,false> test_insert(map, num_inserts, num_duplicates); + test_insert.testit(); + } + + const bool print_list = false; + if (print_list) { + Kokkos::Impl::UnorderedMapPrint<map_type> f(map); + f.apply(); + } + + const uint32_t map_size = map.size(); + + ASSERT_FALSE( map.failed_insert()); + { + EXPECT_EQ(expected_inserts, map_size); + + { + uint32_t find_errors = 0; + Impl::TestFind<const_map_type> test_find(map, num_inserts, num_duplicates); + test_find.testit(find_errors); + EXPECT_EQ( 0u, find_errors); + } + + map.begin_erase(); + Impl::TestErase<map_type,false> test_erase(map, num_inserts, num_duplicates); + test_erase.testit(); + map.end_erase(); + EXPECT_EQ(0u, map.size()); + } +} + +template <typename Device> +void test_failed_insert( uint32_t num_nodes) +{ + typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type; + + map_type map(num_nodes); + Impl::TestInsert<map_type> test_insert(map, 2u*num_nodes, 1u); + test_insert.testit(false /*don't rehash on fail*/); + Device::execution_space::fence(); + + EXPECT_TRUE( map.failed_insert() ); +} + + + +template <typename Device> +void test_deep_copy( uint32_t num_nodes ) +{ + typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type; + typedef Kokkos::UnorderedMap<const uint32_t, 
const uint32_t, Device> const_map_type; + + typedef typename map_type::HostMirror host_map_type ; + // typedef Kokkos::UnorderedMap<uint32_t, uint32_t, typename Device::host_mirror_execution_space > host_map_type; + + map_type map; + map.rehash(num_nodes,false); + + { + Impl::TestInsert<map_type> test_insert(map, num_nodes, 1); + test_insert.testit(); + ASSERT_EQ( map.size(), num_nodes); + ASSERT_FALSE( map.failed_insert() ); + { + uint32_t find_errors = 0; + Impl::TestFind<map_type> test_find(map, num_nodes, 1); + test_find.testit(find_errors); + EXPECT_EQ( find_errors, 0u); + } + + } + + host_map_type hmap; + Kokkos::deep_copy(hmap, map); + + ASSERT_EQ( map.size(), hmap.size()); + ASSERT_EQ( map.capacity(), hmap.capacity()); + { + uint32_t find_errors = 0; + Impl::TestFind<host_map_type> test_find(hmap, num_nodes, 1); + test_find.testit(find_errors); + EXPECT_EQ( find_errors, 0u); + } + + map_type mmap; + Kokkos::deep_copy(mmap, hmap); + + const_map_type cmap = mmap; + + EXPECT_EQ( cmap.size(), num_nodes); + + { + uint32_t find_errors = 0; + Impl::TestFind<const_map_type> test_find(cmap, num_nodes, 1); + test_find.testit(find_errors); + EXPECT_EQ( find_errors, 0u); + } + +} + +} // namespace Test + +#endif //KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/lib/kokkos/containers/unit_tests/TestVector.hpp b/lib/kokkos/containers/unit_tests/TestVector.hpp new file mode 100755 index 0000000000000000000000000000000000000000..f9f4564898edf32e0030d0ca135ff9f43909f397 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestVector.hpp @@ -0,0 +1,131 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_VECTOR_HPP +#define KOKKOS_TEST_VECTOR_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <cstdlib> +#include <cstdio> +#include <impl/Kokkos_Timer.hpp> + +namespace Test { + +namespace Impl { + + template <typename Scalar, class Device> + struct test_vector_combinations + { + typedef test_vector_combinations<Scalar,Device> self_type; + + typedef Scalar scalar_type; + typedef Device execution_space; + + Scalar reference; + Scalar result; + + template <typename Vector> + Scalar run_me(unsigned int n){ + Vector a(n,1); + + + a.push_back(2); + a.resize(n+4); + a[n+1] = 3; + a[n+2] = 4; + a[n+3] = 5; + + + Scalar temp1 = a[2]; + Scalar temp2 = a[n]; + Scalar temp3 = a[n+1]; + + a.assign(n+2,-1); + + a[2] = temp1; + a[n] = temp2; + a[n+1] = temp3; + + Scalar test1 = 0; + for(unsigned int i=0; i<a.size(); i++) + test1+=a[i]; + + a.assign(n+1,-2); + Scalar test2 = 0; + for(unsigned int i=0; i<a.size(); i++) + test2+=a[i]; + + a.reserve(n+10); + + Scalar test3 = 0; + for(unsigned int i=0; i<a.size(); i++) + test3+=a[i]; + + + return (test1*test2+test3)*test2+test1*test3; + } + + + test_vector_combinations(unsigned int size) + { + reference = run_me<std::vector<Scalar> >(size); + result = run_me<Kokkos::vector<Scalar,Device> >(size); + } + + }; + +} // namespace Impl + + + + +template <typename Scalar, typename Device> +void test_vector_combinations(unsigned int size) +{ + Impl::test_vector_combinations<Scalar,Device> test(size); + ASSERT_EQ( test.reference, test.result); +} + + +} // namespace Test + +#endif //KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/lib/kokkos/containers/unit_tests/UnitTestMain.cpp b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp new file mode 100755 index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87 --- /dev/null +++ 
b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp @@ -0,0 +1,50 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +int main(int argc, char *argv[]) { + ::testing::InitGoogleTest(&argc,argv); + return RUN_ALL_TESTS(); +} + diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..2bf189a22f7227084cd02fc28c0ddf591d7e8fe8 --- /dev/null +++ b/lib/kokkos/core/perf_test/Makefile @@ -0,0 +1,66 @@ +KOKKOS_PATH = ../.. + +GTEST_PATH = ../../TPL/gtest + +vpath %.cpp ${KOKKOS_PATH}/core/perf_test + +default: build_all + echo "End Build" + + +include $(KOKKOS_PATH)/Makefile.kokkos + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + CXX = nvcc_wrapper + CXXFLAGS ?= -O3 + LINK = $(CXX) + LDFLAGS ?= -lpthread +else + CXX ?= g++ + CXXFLAGS ?= -O3 + LINK ?= $(CXX) + LDFLAGS ?= -lpthread +endif + +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test + +TEST_TARGETS = +TARGETS = + +OBJ_PERF = PerfTestHost.o PerfTestCuda.o PerfTestMain.o gtest-all.o +TARGETS += KokkosCore_PerformanceTest +TEST_TARGETS += test-performance + +OBJ_ATOMICS = test_atomic.o +TARGETS += KokkosCore_PerformanceTest_Atomics +TEST_TARGETS += test-atomic + + +KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest + +KokkosCore_PerformanceTest_Atomics: $(OBJ_ATOMICS) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ATOMICS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_Atomics + +test-performance: KokkosCore_PerformanceTest + ./KokkosCore_PerformanceTest + +test-atomic: KokkosCore_PerformanceTest_Atomics + ./KokkosCore_PerformanceTest_Atomics + + +build_all: $(TARGETS) + +test: $(TEST_TARGETS) + +clean: kokkos-clean + rm -f *.o $(TARGETS) + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) 
$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< + +gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc + diff --git a/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp b/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp new file mode 100755 index 0000000000000000000000000000000000000000..aa4046cbf047defd47a89141d960ad330622d9b7 --- /dev/null +++ b/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp @@ -0,0 +1,309 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_BLAS_KERNELS_HPP +#define KOKKOS_BLAS_KERNELS_HPP + +namespace Kokkos { + +template< class ConstVectorType , + class Device = typename ConstVectorType::execution_space > +struct Dot ; + +template< class ConstVectorType , + class Device = typename ConstVectorType::execution_space > +struct DotSingle ; + +template< class ConstScalarType , + class VectorType , + class Device = typename VectorType::execution_space > +struct Scale ; + +template< class ConstScalarType , + class ConstVectorType , + class VectorType , + class Device = typename VectorType::execution_space > +struct AXPBY ; + +/** \brief Y = alpha * X + beta * Y */ +template< class ConstScalarType , + class ConstVectorType , + class VectorType > +void axpby( const ConstScalarType & alpha , + const ConstVectorType & X , + const ConstScalarType & beta , + const VectorType & Y ) +{ + typedef AXPBY< ConstScalarType , ConstVectorType , VectorType > functor ; + + parallel_for( Y.dimension_0() , functor( alpha , X , beta , Y ) ); +} + +/** \brief Y *= alpha */ +template< class ConstScalarType , + class VectorType > +void scale( const ConstScalarType & alpha , const VectorType & Y ) +{ + typedef Scale< ConstScalarType , VectorType > functor ; + + parallel_for( Y.dimension_0() , functor( alpha , Y ) ); +} 
+ +template< class ConstVectorType , + class Finalize > +void dot( const ConstVectorType & X , + const ConstVectorType & Y , + const Finalize & finalize ) +{ + typedef Dot< ConstVectorType > functor ; + + parallel_reduce( X.dimension_0() , functor( X , Y ) , finalize ); +} + +template< class ConstVectorType , + class Finalize > +void dot( const ConstVectorType & X , + const Finalize & finalize ) +{ + typedef DotSingle< ConstVectorType > functor ; + + parallel_reduce( X.dimension_0() , functor( X ) , finalize ); +} + +} /* namespace Kokkos */ + + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class Type , class Device > +struct Dot +{ + typedef typename Device::execution_space execution_space ; + + typedef typename + Impl::StaticAssertSame< Impl::unsigned_< 1 > , + Impl::unsigned_< Type::Rank > >::type ok_rank ; + + +/* typedef typename + Impl::StaticAssertSame< execution_space , + typename Type::execution_space >::type ok_device ;*/ + + typedef double value_type ; + +#if 1 + typename Type::const_type X ; + typename Type::const_type Y ; +#else + Type X ; + Type Y ; +#endif + + Dot( const Type & arg_x , const Type & arg_y ) + : X(arg_x) , Y(arg_y) { } + + KOKKOS_INLINE_FUNCTION + void operator()( int i , value_type & update ) const + { update += X[i] * Y[i]; } + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + const volatile value_type & source ) + { update += source; } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & update ) + { update = 0 ; } +}; + +template< class Type , class Device > +struct DotSingle +{ + typedef typename Device::execution_space execution_space ; + + typedef typename + Impl::StaticAssertSame< Impl::unsigned_< 1 > , + Impl::unsigned_< Type::Rank > >::type ok_rank ; + +/* typedef typename + Impl::StaticAssertSame< execution_space , + typename 
Type::execution_space >::type ok_device ;*/ + + typedef double value_type ; + +#if 1 + typename Type::const_type X ; +#else + Type X ; +#endif + + DotSingle( const Type & arg_x ) : X(arg_x) {} + + KOKKOS_INLINE_FUNCTION + void operator()( int i , value_type & update ) const + { + const typename Type::value_type & x = X[i]; update += x * x ; + } + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + const volatile value_type & source ) + { update += source; } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & update ) + { update = 0 ; } +}; + + +template< class ScalarType , class VectorType , class Device> +struct Scale +{ + typedef typename Device::execution_space execution_space ; + +/* typedef typename + Impl::StaticAssertSame< execution_space , + typename ScalarType::execution_space >::type + ok_scalar_device ; + + typedef typename + Impl::StaticAssertSame< execution_space , + typename VectorType::execution_space >::type + ok_vector_device ;*/ + + typedef typename + Impl::StaticAssertSame< Impl::unsigned_< 0 > , + Impl::unsigned_< ScalarType::Rank > >::type + ok_scalar_rank ; + + typedef typename + Impl::StaticAssertSame< Impl::unsigned_< 1 > , + Impl::unsigned_< VectorType::Rank > >::type + ok_vector_rank ; + +#if 1 + typename ScalarType::const_type alpha ; +#else + ScalarType alpha ; +#endif + + VectorType Y ; + + Scale( const ScalarType & arg_alpha , const VectorType & arg_Y ) + : alpha( arg_alpha ), Y( arg_Y ) {} + + KOKKOS_INLINE_FUNCTION + void operator()( int i ) const + { + Y[i] *= alpha() ; + } +}; + + +template< class ScalarType , + class ConstVectorType , + class VectorType, + class Device> +struct AXPBY +{ + typedef typename Device::execution_space execution_space ; + +/* typedef typename + Impl::StaticAssertSame< execution_space , + typename ScalarType::execution_space >::type + ok_scalar_device ; + + typedef typename + Impl::StaticAssertSame< execution_space , + typename ConstVectorType::execution_space >::type + 
ok_const_vector_device ; + + typedef typename + Impl::StaticAssertSame< execution_space , + typename VectorType::execution_space >::type + ok_vector_device ;*/ + + typedef typename + Impl::StaticAssertSame< Impl::unsigned_< 0 > , + Impl::unsigned_< ScalarType::Rank > >::type + ok_scalar_rank ; + + typedef typename + Impl::StaticAssertSame< Impl::unsigned_< 1 > , + Impl::unsigned_< ConstVectorType::Rank > >::type + ok_const_vector_rank ; + + typedef typename + Impl::StaticAssertSame< Impl::unsigned_< 1 > , + Impl::unsigned_< VectorType::Rank > >::type + ok_vector_rank ; + +#if 1 + typename ScalarType::const_type alpha , beta ; + typename ConstVectorType::const_type X ; +#else + ScalarType alpha , beta ; + ConstVectorType X ; +#endif + + VectorType Y ; + + AXPBY( const ScalarType & arg_alpha , + const ConstVectorType & arg_X , + const ScalarType & arg_beta , + const VectorType & arg_Y ) + : alpha( arg_alpha ), beta( arg_beta ), X( arg_X ), Y( arg_Y ) {} + + KOKKOS_INLINE_FUNCTION + void operator()( int i ) const + { + Y[i] = alpha() * X[i] + beta() * Y[i] ; + } +}; + +} /* namespace Kokkos */ + +#endif /* #ifndef KOKKOS_BLAS_KERNELS_HPP */ diff --git a/lib/kokkos/core/perf_test/PerfTestCuda.cpp b/lib/kokkos/core/perf_test/PerfTestCuda.cpp new file mode 100755 index 0000000000000000000000000000000000000000..28e654bb700cb4f6fa1b75636ab38f5c8fdf7326 --- /dev/null +++ b/lib/kokkos/core/perf_test/PerfTestCuda.cpp @@ -0,0 +1,189 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include <iomanip> +#include <algorithm> +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if defined( KOKKOS_HAVE_CUDA ) + +#include <impl/Kokkos_Timer.hpp> + +#include <PerfTestHexGrad.hpp> +#include <PerfTestBlasKernels.hpp> +#include <PerfTestGramSchmidt.hpp> +#include <PerfTestDriver.hpp> + + +namespace Test { + +class cuda : public ::testing::Test { + protected: + static void SetUpTestCase() { + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) ); + } + static void TearDownTestCase() { + Kokkos::Cuda::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +TEST_F( cuda, hexgrad ) +{ + EXPECT_NO_THROW( run_test_hexgrad< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) ); +} + +TEST_F( cuda, gramschmidt ) +{ + EXPECT_NO_THROW( run_test_gramschmidt< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) ); +} + +namespace { + +template <typename T> +struct TextureFetch +{ + typedef Kokkos::View< T *, Kokkos::CudaSpace> array_type; + typedef Kokkos::View< const T *, Kokkos::CudaSpace, Kokkos::MemoryRandomAccess> const_array_type; + typedef Kokkos::View< int *, Kokkos::CudaSpace> index_array_type; + typedef Kokkos::View< const int *, Kokkos::CudaSpace> const_index_array_type; + + struct FillArray + { + array_type m_array; + FillArray( const array_type & array ) + : m_array(array) + {} + + void apply() const + { + Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::Cuda,int>(0,m_array.size()), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { m_array(i) = i; } + }; + + struct RandomIndexes + { + index_array_type m_indexes; + typename index_array_type::HostMirror m_host_indexes; + RandomIndexes( const index_array_type & indexes) + : m_indexes(indexes) + , m_host_indexes(Kokkos::create_mirror(m_indexes)) + {} + + void apply() 
const + { + Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::HostSpace::execution_space,int>(0,m_host_indexes.size()), *this); + //random shuffle + Kokkos::HostSpace::execution_space::fence(); + std::random_shuffle(m_host_indexes.ptr_on_device(), m_host_indexes.ptr_on_device() + m_host_indexes.size()); + Kokkos::deep_copy(m_indexes,m_host_indexes); + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { m_host_indexes(i) = i; } + }; + + struct RandomReduce + { + const_array_type m_array; + const_index_array_type m_indexes; + RandomReduce( const const_array_type & array, const const_index_array_type & indexes) + : m_array(array) + , m_indexes(indexes) + {} + + void apply(T & reduce) const + { + Kokkos::parallel_reduce( Kokkos::RangePolicy<Kokkos::Cuda,int>(0,m_array.size()), *this, reduce); + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i, T & reduce) const + { reduce += m_array(m_indexes(i)); } + }; + + static void run(int size, double & reduce_time, T &reduce) + { + array_type array("array",size); + index_array_type indexes("indexes",size); + + { FillArray f(array); f.apply(); } + { RandomIndexes f(indexes); f.apply(); } + + Kokkos::Cuda::fence(); + + Kokkos::Impl::Timer timer; + for (int j=0; j<10; ++j) { + RandomReduce f(array,indexes); + f.apply(reduce); + } + Kokkos::Cuda::fence(); + reduce_time = timer.seconds(); + } +}; + +} // unnamed namespace + +TEST_F( cuda, texture_double ) +{ + printf("Random reduce of double through texture fetch\n"); + for (int i=1; i<=27; ++i) { + int size = 1<<i; + double time = 0; + double reduce = 0; + TextureFetch<double>::run(size,time,reduce); + printf(" time = %1.3e size = 2^%d\n", time, i); + } +} + +} // namespace Test + +#endif /* #if defined( KOKKOS_HAVE_CUDA ) */ + diff --git a/lib/kokkos/core/perf_test/PerfTestDriver.hpp b/lib/kokkos/core/perf_test/PerfTestDriver.hpp new file mode 100755 index 0000000000000000000000000000000000000000..e3dd3b4123a2dae6fd4f69f77a046796f9c040c8 --- /dev/null +++ 
b/lib/kokkos/core/perf_test/PerfTestDriver.hpp @@ -0,0 +1,152 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include <string> + +// mfh 06 Jun 2013: This macro doesn't work like one might thing it +// should. It doesn't take the template parameter DeviceType and +// print its actual type name; it just literally prints out +// "DeviceType". I've worked around this below without using the +// macro, so I'm commenting out the macro to avoid compiler complaints +// about an unused macro. + +// #define KOKKOS_MACRO_IMPL_TO_STRING( X ) #X +// #define KOKKOS_MACRO_TO_STRING( X ) KOKKOS_MACRO_IMPL_TO_STRING( X ) + +//------------------------------------------------------------------------ + +namespace Test { + +enum { NUMBER_OF_TRIALS = 5 }; + + + +template< class DeviceType > +void run_test_hexgrad( int exp_beg , int exp_end, const char deviceTypeName[] ) +{ + std::string label_hexgrad ; + label_hexgrad.append( "\"HexGrad< double , " ); + // mfh 06 Jun 2013: This only appends "DeviceType" (literally) to + // the string, not the actual name of the device type. Thus, I've + // modified the function to take the name of the device type. 
+ // + //label_hexgrad.append( KOKKOS_MACRO_TO_STRING( DeviceType ) ); + label_hexgrad.append( deviceTypeName ); + label_hexgrad.append( " >\"" ); + + for (int i = exp_beg ; i < exp_end ; ++i) { + double min_seconds = 0.0 ; + double max_seconds = 0.0 ; + double avg_seconds = 0.0 ; + + const int parallel_work_length = 1<<i; + + for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) { + const double seconds = HexGrad< DeviceType >::test(parallel_work_length) ; + + if ( 0 == j ) { + min_seconds = seconds ; + max_seconds = seconds ; + } + else { + if ( seconds < min_seconds ) min_seconds = seconds ; + if ( seconds > max_seconds ) max_seconds = seconds ; + } + avg_seconds += seconds ; + } + avg_seconds /= NUMBER_OF_TRIALS ; + + std::cout << label_hexgrad + << " , " << parallel_work_length + << " , " << min_seconds + << " , " << ( min_seconds / parallel_work_length ) + << std::endl ; + } +} + +template< class DeviceType > +void run_test_gramschmidt( int exp_beg , int exp_end, const char deviceTypeName[] ) +{ + std::string label_gramschmidt ; + label_gramschmidt.append( "\"GramSchmidt< double , " ); + // mfh 06 Jun 2013: This only appends "DeviceType" (literally) to + // the string, not the actual name of the device type. Thus, I've + // modified the function to take the name of the device type. 
+ // + //label_gramschmidt.append( KOKKOS_MACRO_TO_STRING( DeviceType ) ); + label_gramschmidt.append( deviceTypeName ); + label_gramschmidt.append( " >\"" ); + + for (int i = exp_beg ; i < exp_end ; ++i) { + double min_seconds = 0.0 ; + double max_seconds = 0.0 ; + double avg_seconds = 0.0 ; + + const int parallel_work_length = 1<<i; + + for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) { + const double seconds = ModifiedGramSchmidt< double , DeviceType >::test(parallel_work_length, 32 ) ; + + if ( 0 == j ) { + min_seconds = seconds ; + max_seconds = seconds ; + } + else { + if ( seconds < min_seconds ) min_seconds = seconds ; + if ( seconds > max_seconds ) max_seconds = seconds ; + } + avg_seconds += seconds ; + } + avg_seconds /= NUMBER_OF_TRIALS ; + + std::cout << label_gramschmidt + << " , " << parallel_work_length + << " , " << min_seconds + << " , " << ( min_seconds / parallel_work_length ) + << std::endl ; + } +} + +} + diff --git a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp new file mode 100755 index 0000000000000000000000000000000000000000..292e09cc4a69783278d536a713e2d9df19b4d6c1 --- /dev/null +++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp @@ -0,0 +1,231 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cmath> +#include <PerfTestBlasKernels.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Test { + +// Reduction : result = dot( Q(:,j) , Q(:,j) ); +// PostProcess : R(j,j) = result ; inv = 1 / result ; +template< class VectorView , class ValueView > +struct InvNorm2 : public Kokkos::DotSingle< VectorView > { + + typedef typename Kokkos::DotSingle< VectorView >::value_type value_type ; + + ValueView Rjj ; + ValueView inv ; + + InvNorm2( const VectorView & argX , + const ValueView & argR , + const ValueView & argInv ) + : Kokkos::DotSingle< VectorView >( argX ) + , Rjj( argR ) + , inv( argInv ) + {} + + KOKKOS_INLINE_FUNCTION + void final( value_type & result ) const + { + result = sqrt( result ); + Rjj() = result ; + inv() = ( 0 < result ) ? 
1.0 / result : 0 ; + } +}; + +template< class VectorView , class ValueView > +inline +void invnorm2( const VectorView & x , + const ValueView & r , + const ValueView & r_inv ) +{ + Kokkos::parallel_reduce( x.dimension_0() , InvNorm2< VectorView , ValueView >( x , r , r_inv ) ); +} + +// PostProcess : tmp = - ( R(j,k) = result ); +template< class VectorView , class ValueView > +struct DotM : public Kokkos::Dot< VectorView > { + + typedef typename Kokkos::Dot< VectorView >::value_type value_type ; + + ValueView Rjk ; + ValueView tmp ; + + DotM( const VectorView & argX , + const VectorView & argY , + const ValueView & argR , + const ValueView & argTmp ) + : Kokkos::Dot< VectorView >( argX , argY ) + , Rjk( argR ) + , tmp( argTmp ) + {} + + KOKKOS_INLINE_FUNCTION + void final( value_type & result ) const + { + Rjk() = result ; + tmp() = - result ; + } +}; + +template< class VectorView , class ValueView > +inline +void dot_neg( const VectorView & x , + const VectorView & y , + const ValueView & r , + const ValueView & r_neg ) +{ + Kokkos::parallel_reduce( x.dimension_0() , DotM< VectorView , ValueView >( x , y , r , r_neg ) ); +} + + +template< typename Scalar , class DeviceType > +struct ModifiedGramSchmidt +{ + typedef DeviceType execution_space ; + typedef typename execution_space::size_type size_type ; + + typedef Kokkos::View< Scalar** , + Kokkos::LayoutLeft , + execution_space > multivector_type ; + + typedef Kokkos::View< Scalar* , + Kokkos::LayoutLeft , + execution_space > vector_type ; + + typedef Kokkos::View< Scalar , + Kokkos::LayoutLeft , + execution_space > value_view ; + + + multivector_type Q ; + multivector_type R ; + + static double factorization( const multivector_type Q_ , + const multivector_type R_ ) + { +#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + using Kokkos::Experimental::ALL ; +#else + const Kokkos::ALL ALL ; +#endif + const size_type count = Q_.dimension_1(); + value_view tmp("tmp"); + value_view one("one"); + + Kokkos::deep_copy( one , 
(Scalar) 1 ); + + Kokkos::Impl::Timer timer ; + + for ( size_type j = 0 ; j < count ; ++j ) { + // Reduction : tmp = dot( Q(:,j) , Q(:,j) ); + // PostProcess : tmp = sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ; + const vector_type Qj = Kokkos::subview( Q_ , ALL , j ); + const value_view Rjj = Kokkos::subview( R_ , j , j ); + + invnorm2( Qj , Rjj , tmp ); + + // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ; + Kokkos::scale( tmp , Qj ); + + for ( size_t k = j + 1 ; k < count ; ++k ) { + const vector_type Qk = Kokkos::subview( Q_ , ALL , k ); + const value_view Rjk = Kokkos::subview( R_ , j , k ); + + // Reduction : R(j,k) = dot( Q(:,j) , Q(:,k) ); + // PostProcess : tmp = - R(j,k); + dot_neg( Qj , Qk , Rjk , tmp ); + + // Q(:,k) -= R(j,k) * Q(:,j); => Q(:,k) += tmp * Q(:,j) + Kokkos::axpby( tmp , Qj , one , Qk ); + } + } + + execution_space::fence(); + + return timer.seconds(); + } + + //-------------------------------------------------------------------------- + + static double test( const size_t length , + const size_t count , + const size_t iter = 1 ) + { + multivector_type Q_( "Q" , length , count ); + multivector_type R_( "R" , count , count ); + + typename multivector_type::HostMirror A = + Kokkos::create_mirror( Q_ ); + + // Create and fill A on the host + + for ( size_type j = 0 ; j < count ; ++j ) { + for ( size_type i = 0 ; i < length ; ++i ) { + A(i,j) = ( i + 1 ) * ( j + 1 ); + } + } + + double dt_min = 0 ; + + for ( size_t i = 0 ; i < iter ; ++i ) { + + Kokkos::deep_copy( Q_ , A ); + + // A = Q * R + + const double dt = factorization( Q_ , R_ ); + + if ( 0 == i ) dt_min = dt ; + else dt_min = dt < dt_min ? 
dt : dt_min ; + } + + return dt_min ; + } +}; + +} + diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp new file mode 100755 index 0000000000000000000000000000000000000000..d13d9a49e800b8064852c174755c4eea3a94be4b --- /dev/null +++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp @@ -0,0 +1,268 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace Test { + +template< class DeviceType , + typename CoordScalarType = double , + typename GradScalarType = float > +struct HexGrad +{ + typedef DeviceType execution_space ; + typedef typename execution_space::size_type size_type ; + + typedef HexGrad<DeviceType,CoordScalarType,GradScalarType> self_type; + + // 3D array : ( ParallelWork , Space , Node ) + + enum { NSpace = 3 , NNode = 8 }; + + typedef Kokkos::View< CoordScalarType*[NSpace][NNode] , execution_space > + elem_coord_type ; + + typedef Kokkos::View< GradScalarType*[NSpace][NNode] , execution_space > + elem_grad_type ; + + elem_coord_type coords ; + elem_grad_type grad_op ; + + enum { FLOPS = 318 }; // = 3 * ( 18 + 8 * 11 ) }; + enum { READS = 18 }; + enum { WRITES = 18 }; + + HexGrad( const elem_coord_type & arg_coords , + const elem_grad_type & arg_grad_op ) + : coords( arg_coords ) + , grad_op( arg_grad_op ) + {} + + KOKKOS_INLINE_FUNCTION static + void grad( const CoordScalarType x[] , + const CoordScalarType z[] , + GradScalarType grad_y[] ) + { + const GradScalarType R42=(x[3] - x[1]); + const GradScalarType R52=(x[4] - x[1]); + const GradScalarType R54=(x[4] - x[3]); + + const GradScalarType R63=(x[5] - x[2]); + const GradScalarType R83=(x[7] - x[2]); + const GradScalarType R86=(x[7] - x[5]); + 
+ const GradScalarType R31=(x[2] - x[0]); + const GradScalarType R61=(x[5] - x[0]); + const GradScalarType R74=(x[6] - x[3]); + + const GradScalarType R72=(x[6] - x[1]); + const GradScalarType R75=(x[6] - x[4]); + const GradScalarType R81=(x[7] - x[0]); + + const GradScalarType t1=(R63 + R54); + const GradScalarType t2=(R61 + R74); + const GradScalarType t3=(R72 + R81); + + const GradScalarType t4 =(R86 + R42); + const GradScalarType t5 =(R83 + R52); + const GradScalarType t6 =(R75 + R31); + + // Calculate Y gradient from X and Z data + + grad_y[0] = (z[1] * t1) - (z[2] * R42) - (z[3] * t5) + (z[4] * t4) + (z[5] * R52) - (z[7] * R54); + grad_y[1] = (z[2] * t2) + (z[3] * R31) - (z[0] * t1) - (z[5] * t6) + (z[6] * R63) - (z[4] * R61); + grad_y[2] = (z[3] * t3) + (z[0] * R42) - (z[1] * t2) - (z[6] * t4) + (z[7] * R74) - (z[5] * R72); + grad_y[3] = (z[0] * t5) - (z[1] * R31) - (z[2] * t3) + (z[7] * t6) + (z[4] * R81) - (z[6] * R83); + grad_y[4] = (z[5] * t3) + (z[6] * R86) - (z[7] * t2) - (z[0] * t4) - (z[3] * R81) + (z[1] * R61); + grad_y[5] = (z[6] * t5) - (z[4] * t3) - (z[7] * R75) + (z[1] * t6) - (z[0] * R52) + (z[2] * R72); + grad_y[6] = (z[7] * t1) - (z[5] * t5) - (z[4] * R86) + (z[2] * t4) - (z[1] * R63) + (z[3] * R83); + grad_y[7] = (z[4] * t2) - (z[6] * t1) + (z[5] * R75) - (z[3] * t6) - (z[2] * R74) + (z[0] * R54); + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type ielem ) const + { + GradScalarType g[NNode] ; + + const CoordScalarType x[NNode] = { + coords(ielem,0,0), + coords(ielem,0,1), + coords(ielem,0,2), + coords(ielem,0,3), + coords(ielem,0,4), + coords(ielem,0,5), + coords(ielem,0,6), + coords(ielem,0,7) + }; + + const CoordScalarType y[NNode] = { + coords(ielem,1,0), + coords(ielem,1,1), + coords(ielem,1,2), + coords(ielem,1,3), + coords(ielem,1,4), + coords(ielem,1,5), + coords(ielem,1,6), + coords(ielem,1,7) + }; + + const CoordScalarType z[NNode] = { + coords(ielem,2,0), + coords(ielem,2,1), + coords(ielem,2,2), + coords(ielem,2,3), + 
coords(ielem,2,4), + coords(ielem,2,5), + coords(ielem,2,6), + coords(ielem,2,7) + }; + + grad( z , y , g ); + + grad_op(ielem,0,0) = g[0]; + grad_op(ielem,0,1) = g[1]; + grad_op(ielem,0,2) = g[2]; + grad_op(ielem,0,3) = g[3]; + grad_op(ielem,0,4) = g[4]; + grad_op(ielem,0,5) = g[5]; + grad_op(ielem,0,6) = g[6]; + grad_op(ielem,0,7) = g[7]; + + grad( x , z , g ); + + grad_op(ielem,1,0) = g[0]; + grad_op(ielem,1,1) = g[1]; + grad_op(ielem,1,2) = g[2]; + grad_op(ielem,1,3) = g[3]; + grad_op(ielem,1,4) = g[4]; + grad_op(ielem,1,5) = g[5]; + grad_op(ielem,1,6) = g[6]; + grad_op(ielem,1,7) = g[7]; + + grad( y , x , g ); + + grad_op(ielem,2,0) = g[0]; + grad_op(ielem,2,1) = g[1]; + grad_op(ielem,2,2) = g[2]; + grad_op(ielem,2,3) = g[3]; + grad_op(ielem,2,4) = g[4]; + grad_op(ielem,2,5) = g[5]; + grad_op(ielem,2,6) = g[6]; + grad_op(ielem,2,7) = g[7]; + } + + //-------------------------------------------------------------------------- + + struct Init { + typedef typename self_type::execution_space execution_space ; + + elem_coord_type coords ; + + Init( const elem_coord_type & arg_coords ) + : coords( arg_coords ) {} + + KOKKOS_INLINE_FUNCTION + void operator()( size_type ielem ) const + { + coords(ielem,0,0) = 0.; + coords(ielem,1,0) = 0.; + coords(ielem,2,0) = 0.; + + coords(ielem,0,1) = 1.; + coords(ielem,1,1) = 0.; + coords(ielem,2,1) = 0.; + + coords(ielem,0,2) = 1.; + coords(ielem,1,2) = 1.; + coords(ielem,2,2) = 0.; + + coords(ielem,0,3) = 0.; + coords(ielem,1,3) = 1.; + coords(ielem,2,3) = 0.; + + + coords(ielem,0,4) = 0.; + coords(ielem,1,4) = 0.; + coords(ielem,2,4) = 1.; + + coords(ielem,0,5) = 1.; + coords(ielem,1,5) = 0.; + coords(ielem,2,5) = 1.; + + coords(ielem,0,6) = 1.; + coords(ielem,1,6) = 1.; + coords(ielem,2,6) = 1.; + + coords(ielem,0,7) = 0.; + coords(ielem,1,7) = 1.; + coords(ielem,2,7) = 1.; + } + }; + + //-------------------------------------------------------------------------- + + static double test( const int count , const int iter = 1 ) + { 
+ elem_coord_type coord( "coord" , count ); + elem_grad_type grad ( "grad" , count ); + + // Execute the parallel kernels on the arrays: + + double dt_min = 0 ; + + Kokkos::parallel_for( count , Init( coord ) ); + execution_space::fence(); + + for ( int i = 0 ; i < iter ; ++i ) { + Kokkos::Impl::Timer timer ; + Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) ); + execution_space::fence(); + const double dt = timer.seconds(); + if ( 0 == i ) dt_min = dt ; + else dt_min = dt < dt_min ? dt : dt_min ; + } + + return dt_min ; + } +}; + +} + diff --git a/lib/kokkos/core/perf_test/PerfTestHost.cpp b/lib/kokkos/core/perf_test/PerfTestHost.cpp new file mode 100755 index 0000000000000000000000000000000000000000..6a0f2efadacd01e979d3beefd23b617b81acff48 --- /dev/null +++ b/lib/kokkos/core/perf_test/PerfTestHost.cpp @@ -0,0 +1,104 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if defined( KOKKOS_HAVE_OPENMP ) + +typedef Kokkos::OpenMP TestHostDevice ; +const char TestHostDeviceName[] = "Kokkos::OpenMP" ; + +#elif defined( KOKKOS_HAVE_PTHREAD ) + +typedef Kokkos::Threads TestHostDevice ; +const char TestHostDeviceName[] = "Kokkos::Threads" ; + +#elif defined( KOKKOS_HAVE_SERIAL ) + +typedef Kokkos::Serial TestHostDevice ; +const char TestHostDeviceName[] = "Kokkos::Serial" ; + +#else +# error "You must enable at least one of the following execution spaces in order to build this test: Kokkos::Threads, Kokkos::OpenMP, or Kokkos::Serial." 
+#endif + +#include <impl/Kokkos_Timer.hpp> + +#include <PerfTestHexGrad.hpp> +#include <PerfTestBlasKernels.hpp> +#include <PerfTestGramSchmidt.hpp> +#include <PerfTestDriver.hpp> + +//------------------------------------------------------------------------ + +namespace Test { + +class host : public ::testing::Test { +protected: + static void SetUpTestCase() + { + const unsigned team_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned threads_per_team = 4 ; + + TestHostDevice::initialize( team_count * threads_per_team ); + } + + static void TearDownTestCase() + { + TestHostDevice::finalize(); + } +}; + +TEST_F( host, hexgrad ) { + EXPECT_NO_THROW(run_test_hexgrad< TestHostDevice>( 10, 20, TestHostDeviceName )); +} + +TEST_F( host, gramschmidt ) { + EXPECT_NO_THROW(run_test_gramschmidt< TestHostDevice>( 10, 20, TestHostDeviceName )); +} + +} // namespace Test + + diff --git a/lib/kokkos/core/perf_test/PerfTestMain.cpp b/lib/kokkos/core/perf_test/PerfTestMain.cpp new file mode 100755 index 0000000000000000000000000000000000000000..ac916308292076fc27231968715518b3f5c02f80 --- /dev/null +++ b/lib/kokkos/core/perf_test/PerfTestMain.cpp @@ -0,0 +1,49 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +int main(int argc, char *argv[]) { + ::testing::InitGoogleTest(&argc,argv); + return RUN_ALL_TESTS(); +} diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp new file mode 100755 index 0000000000000000000000000000000000000000..f1e5c1b6200474417bc822ed1f9b2a217de51bfd --- /dev/null +++ b/lib/kokkos/core/perf_test/test_atomic.cpp @@ -0,0 +1,504 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> +#include <cstring> +#include <cstdlib> + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> + +typedef Kokkos::DefaultExecutionSpace exec_space; + +#define RESET 0 +#define BRIGHT 1 +#define DIM 2 +#define UNDERLINE 3 +#define BLINK 4 +#define REVERSE 7 +#define HIDDEN 8 + +#define BLACK 0 +#define RED 1 +#define GREEN 2 +#define YELLOW 3 +#define BLUE 4 +#define MAGENTA 5 +#define CYAN 6 +#define GREY 7 +#define WHITE 8 + +void textcolor(int attr, int fg, int bg) +{ char command[13]; + + /* Command is the control command to the terminal */ + sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40); + printf("%s", command); +} +void textcolor_standard() {textcolor(RESET, BLACK, WHITE);} + + +template<class T,class DEVICE_TYPE> +struct ZeroFunctor{ + typedef DEVICE_TYPE execution_space; + typedef typename Kokkos::View<T,execution_space> type; + typedef typename Kokkos::View<T,execution_space>::HostMirror h_type; + type data; + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + data() = 0; + } +}; + +//--------------------------------------------------- +//--------------atomic_fetch_add--------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct AddFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + Kokkos::atomic_fetch_add(&data(),(T)1); + } +}; + +template<class T> +T AddLoop(int loop) { + struct ZeroFunctor<T,exec_space> f_zero; + typename ZeroFunctor<T,exec_space>::type data("Data"); + typename ZeroFunctor<T,exec_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1,f_zero); + exec_space::fence(); + + struct AddFunctor<T,exec_space> f_add; + f_add.data = data; + 
Kokkos::parallel_for(loop,f_add); + exec_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T,class DEVICE_TYPE> +struct AddNonAtomicFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + data()+=(T)1; + } +}; + +template<class T> +T AddLoopNonAtomic(int loop) { + struct ZeroFunctor<T,exec_space> f_zero; + typename ZeroFunctor<T,exec_space>::type data("Data"); + typename ZeroFunctor<T,exec_space>::h_type h_data("HData"); + + f_zero.data = data; + Kokkos::parallel_for(1,f_zero); + exec_space::fence(); + + struct AddNonAtomicFunctor<T,exec_space> f_add; + f_add.data = data; + Kokkos::parallel_for(loop,f_add); + exec_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + + return val; +} + +template<class T> +T AddLoopSerial(int loop) { + T* data = new T[1]; + data[0] = 0; + + for(int i=0;i<loop;i++) + *data+=(T)1; + + T val = *data; + delete data; + return val; +} + +template<class T,class DEVICE_TYPE> +struct CASFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + T old = data(); + T newval, assumed; + do { + assumed = old; + newval = assumed + (T)1; + old = Kokkos::atomic_compare_exchange(&data(), assumed, newval); + } + while( old != assumed ); + } +}; + +template<class T> +T CASLoop(int loop) { + struct ZeroFunctor<T,exec_space> f_zero; + typename ZeroFunctor<T,exec_space>::type data("Data"); + typename ZeroFunctor<T,exec_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1,f_zero); + exec_space::fence(); + + struct CASFunctor<T,exec_space> f_cas; + f_cas.data = data; + Kokkos::parallel_for(loop,f_cas); + exec_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + + return val; +} + +template<class T,class 
DEVICE_TYPE> +struct CASNonAtomicFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + volatile T assumed; + volatile T newval; + bool fail=1; + do { + assumed = data(); + newval = assumed + (T)1; + if(data()==assumed) { + data() = newval; + fail = 0; + } + } + while(fail); + } +}; + +template<class T> +T CASLoopNonAtomic(int loop) { + struct ZeroFunctor<T,exec_space> f_zero; + typename ZeroFunctor<T,exec_space>::type data("Data"); + typename ZeroFunctor<T,exec_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1,f_zero); + exec_space::fence(); + + struct CASNonAtomicFunctor<T,exec_space> f_cas; + f_cas.data = data; + Kokkos::parallel_for(loop,f_cas); + exec_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + + return val; +} + +template<class T> +T CASLoopSerial(int loop) { + T* data = new T[1]; + data[0] = 0; + + for(int i=0;i<loop;i++) { + T assumed; + T newval; + T old; + do { + assumed = *data; + newval = assumed + (T)1; + old = *data; + *data = newval; + } + while(!(assumed==old)); + } + + T val = *data; + delete data; + return val; +} + +template<class T,class DEVICE_TYPE> +struct ExchFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data, data2; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + T old = Kokkos::atomic_exchange(&data(),(T)i); + Kokkos::atomic_fetch_add(&data2(),old); + } +}; + +template<class T> +T ExchLoop(int loop) { + struct ZeroFunctor<T,exec_space> f_zero; + typename ZeroFunctor<T,exec_space>::type data("Data"); + typename ZeroFunctor<T,exec_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1,f_zero); + exec_space::fence(); + + typename ZeroFunctor<T,exec_space>::type data2("Data"); + typename ZeroFunctor<T,exec_space>::h_type h_data2("HData"); + f_zero.data = data2; + 
Kokkos::parallel_for(1,f_zero); + exec_space::fence(); + + struct ExchFunctor<T,exec_space> f_exch; + f_exch.data = data; + f_exch.data2 = data2; + Kokkos::parallel_for(loop,f_exch); + exec_space::fence(); + + Kokkos::deep_copy(h_data,data); + Kokkos::deep_copy(h_data2,data2); + T val = h_data() + h_data2(); + + return val; +} + +template<class T,class DEVICE_TYPE> +struct ExchNonAtomicFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data, data2; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + T old = data(); + data()=(T) i; + data2()+=old; + } +}; + + +template<class T> +T ExchLoopNonAtomic(int loop) { + struct ZeroFunctor<T,exec_space> f_zero; + typename ZeroFunctor<T,exec_space>::type data("Data"); + typename ZeroFunctor<T,exec_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1,f_zero); + exec_space::fence(); + + typename ZeroFunctor<T,exec_space>::type data2("Data"); + typename ZeroFunctor<T,exec_space>::h_type h_data2("HData"); + f_zero.data = data2; + Kokkos::parallel_for(1,f_zero); + exec_space::fence(); + + struct ExchNonAtomicFunctor<T,exec_space> f_exch; + f_exch.data = data; + f_exch.data2 = data2; + Kokkos::parallel_for(loop,f_exch); + exec_space::fence(); + + Kokkos::deep_copy(h_data,data); + Kokkos::deep_copy(h_data2,data2); + T val = h_data() + h_data2(); + + return val; +} + +template<class T> +T ExchLoopSerial(int loop) { + T* data = new T[1]; + T* data2 = new T[1]; + data[0] = 0; + data2[0] = 0; + for(int i=0;i<loop;i++) { + T old = *data; + *data=(T) i; + *data2+=old; + } + + T val = *data2 + *data; + delete data; + delete data2; + return val; +} + +template<class T> +T LoopVariant(int loop, int test) { + switch (test) { + case 1: return AddLoop<T>(loop); + case 2: return CASLoop<T>(loop); + case 3: return ExchLoop<T>(loop); + } + return 0; +} + +template<class T> +T LoopVariantSerial(int loop, int test) { + switch (test) { + case 1: return 
AddLoopSerial<T>(loop); + case 2: return CASLoopSerial<T>(loop); + case 3: return ExchLoopSerial<T>(loop); + } + return 0; +} + +template<class T> +T LoopVariantNonAtomic(int loop, int test) { + switch (test) { + case 1: return AddLoopNonAtomic<T>(loop); + case 2: return CASLoopNonAtomic<T>(loop); + case 3: return ExchLoopNonAtomic<T>(loop); + } + return 0; +} + +template<class T> +void Loop(int loop, int test, const char* type_name) { + LoopVariant<T>(loop,test); + + Kokkos::Impl::Timer timer; + T res = LoopVariant<T>(loop,test); + double time1 = timer.seconds(); + + timer.reset(); + T resNonAtomic = LoopVariantNonAtomic<T>(loop,test); + double time2 = timer.seconds(); + + timer.reset(); + T resSerial = LoopVariantSerial<T>(loop,test); + double time3 = timer.seconds(); + + time1*=1e6/loop; + time2*=1e6/loop; + time3*=1e6/loop; + //textcolor_standard(); + bool passed = true; + if(resSerial!=res) passed = false; + //if(!passed) textcolor(RESET,BLACK,YELLOW); + printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T)); + //if(!passed) textcolor_standard(); + printf("\n"); +} + + +template<class T> +void Test(int loop, int test, const char* type_name) { + if(test==-1) { + Loop<T>(loop,1,type_name); + Loop<T>(loop,2,type_name); + Loop<T>(loop,3,type_name); + + } + else + Loop<T>(loop,test,type_name); +} + +int main(int argc, char* argv[]) +{ + int type = -1; + int loop = 1000000; + int test = -1; + + for(int i=0;i<argc;i++) + { + if((strcmp(argv[i],"--test")==0)) {test=atoi(argv[++i]); continue;} + if((strcmp(argv[i],"--type")==0)) {type=atoi(argv[++i]); continue;} + if((strcmp(argv[i],"-l")==0)||(strcmp(argv[i],"--loop")==0)) {loop=atoi(argv[++i]); continue;} + } + + + Kokkos::initialize(argc,argv); + + + printf("Using %s\n",Kokkos::atomic_query_version()); + bool all_tests = false; + if(type==-1) all_tests = 
true; + while(type<100) { + if(type==1) { + Test<int>(loop,test,"int "); + } + if(type==2) { + Test<long int>(loop,test,"long int "); + } + if(type==3) { + Test<long long int>(loop,test,"long long int "); + } + if(type==4) { + Test<unsigned int>(loop,test,"unsigned int "); + } + if(type==5) { + Test<unsigned long int>(loop,test,"unsigned long int "); + } + if(type==6) { + Test<unsigned long long int>(loop,test,"unsigned long long int "); + } + if(type==10) { + //Test<float>(loop,test,"float "); + } + if(type==11) { + Test<double>(loop,test,"double "); + } + if(!all_tests) type=100; + else type++; + } + + Kokkos::finalize(); + +} + diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp new file mode 100755 index 0000000000000000000000000000000000000000..37c5e53e58e901a3519a5c60bdaf5aec001c80e6 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp @@ -0,0 +1,283 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP +#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP + +/* only compile this file if CUDA is enabled for Kokkos */ +#if defined( KOKKOS_HAVE_CUDA ) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +//---------------------------------------------------------------------------- +// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4) +// Via reinterpret_case this can be used to support all scalar types of those sizes. 
+// Any other scalar type falls back to either normal reads out of global memory, +// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0) + +template< typename ValueType , typename AliasType > +struct CudaTextureFetch { + + ::cudaTextureObject_t m_obj ; + const ValueType * m_ptr ; + int m_offset ; + + // Deference operator pulls through texture object and returns by value + template< typename iType > + KOKKOS_INLINE_FUNCTION + ValueType operator[]( const iType & i ) const + { +#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) + AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset ); + return *(reinterpret_cast<ValueType*> (&v)); +#else + return m_ptr[ i ]; +#endif + } + + // Pointer to referenced memory + KOKKOS_INLINE_FUNCTION + operator const ValueType * () const { return m_ptr ; } + + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {} + + KOKKOS_INLINE_FUNCTION + ~CudaTextureFetch() {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch( const CudaTextureFetch & rhs ) + : m_obj( rhs.m_obj ) + , m_ptr( rhs.m_ptr ) + , m_offset( rhs.m_offset ) + {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch( CudaTextureFetch && rhs ) + : m_obj( rhs.m_obj ) + , m_ptr( rhs.m_ptr ) + , m_offset( rhs.m_offset ) + {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) + { + m_obj = rhs.m_obj ; + m_ptr = rhs.m_ptr ; + m_offset = rhs.m_offset ; + return *this ; + } + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch & operator = ( CudaTextureFetch && rhs ) + { + m_obj = rhs.m_obj ; + m_ptr = rhs.m_ptr ; + m_offset = rhs.m_offset ; + return *this ; + } + + // Texture object spans the entire allocation. + // This handle may view a subset of the allocation, so an offset is required. 
+ template< class CudaMemorySpace > + inline explicit + CudaTextureFetch( const ValueType * const arg_ptr + , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record + ) + // 'attach_texture_object' returns 0 when __CUDA_ARCH__ < 300 + : m_obj( record.template attach_texture_object< AliasType >() ) + , m_ptr( arg_ptr ) + , m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) ) + {} +}; + +#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC ) + +template< typename ValueType , typename AliasType > +struct CudaLDGFetch { + + const ValueType * m_ptr ; + + template< typename iType > + KOKKOS_INLINE_FUNCTION + ValueType operator[]( const iType & i ) const + { + AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_ptr[i])); + return *(reinterpret_cast<ValueType*> (&v)); + } + + KOKKOS_INLINE_FUNCTION + operator const ValueType * () const { return m_ptr ; } + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch() : m_ptr() {} + + KOKKOS_INLINE_FUNCTION + ~CudaLDGFetch() {} + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch( const CudaLDGFetch & rhs ) + : m_ptr( rhs.m_ptr ) + {} + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch( CudaLDGFetch && rhs ) + : m_ptr( rhs.m_ptr ) + {} + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch & operator = ( const CudaLDGFetch & rhs ) + { + m_ptr = rhs.m_ptr ; + return *this ; + } + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch & operator = ( CudaLDGFetch && rhs ) + { + m_ptr = rhs.m_ptr ; + return *this ; + } + + template< class CudaMemorySpace > + inline explicit + CudaTextureFetch( const ValueType * const arg_ptr + , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const & + ) + : m_ptr( arg_data_ptr ) + {} +}; + +#endif + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace 
Experimental { +namespace Impl { + +/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization + * if 'const' value type, CudaSpace and random access. + */ +template< class Traits > +class ViewDataHandle< Traits , + typename std::enable_if<( + // Is Cuda memory space + ( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value || + std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value ) + && + // Is a trivial const value of 4, 8, or 16 bytes + std::is_trivial<typename Traits::const_value_type>::value + && + std::is_same<typename Traits::const_value_type,typename Traits::value_type>::value + && + ( sizeof(typename Traits::const_value_type) == 4 || + sizeof(typename Traits::const_value_type) == 8 || + sizeof(typename Traits::const_value_type) == 16 ) + && + // Random access trait + ( Traits::memory_traits::RandomAccess != 0 ) + )>::type > +{ +public: + + using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ; + + using value_type = typename Traits::const_value_type ; + using return_type = typename Traits::const_value_type ; // NOT a reference + + using alias_type = typename std::conditional< ( sizeof(value_type) == 4 ) , int , + typename std::conditional< ( sizeof(value_type) == 8 ) , ::int2 , + typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void + >::type + >::type + >::type ; + +#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC ) + using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ; +#else + using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ; +#endif + + KOKKOS_INLINE_FUNCTION + static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ ) + { + return arg_handle ; + } + + KOKKOS_INLINE_FUNCTION + static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker ) + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + // Assignment 
of texture = non-texture requires creation of a texture object + // which can only occur on the host. In addition, 'get_record' is only valid + // if called in a host execution space + return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() ); +#else + Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel"); + return handle_type(); +#endif + } +}; + +} +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_HAVE_CUDA ) */ +#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */ + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp new file mode 100755 index 0000000000000000000000000000000000000000..c1b2d51c477e8f99dad975f4f33757f8af04393a --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp @@ -0,0 +1,277 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDAEXEC_HPP +#define KOKKOS_CUDAEXEC_HPP + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#ifdef KOKKOS_HAVE_CUDA + +#include <string> +#include <Kokkos_Parallel.hpp> +#include <impl/Kokkos_Error.hpp> +#include <Cuda/Kokkos_Cuda_abort.hpp> +#include <Cuda/Kokkos_Cuda_Error.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +struct CudaTraits { + enum { WarpSize = 32 /* 0x0020 */ }; + enum { WarpIndexMask = 0x001f /* Mask for warpindex */ }; + enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ }; + + enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ }; + enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ }; + enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ }; + + enum { UpperBoundGridCount = 65535 /* Hard upper bound */ }; + enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ }; + enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ }; + enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ }; + + typedef unsigned long + ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ]; + + enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ }; + + KOKKOS_INLINE_FUNCTION static + CudaSpace::size_type warp_count( CudaSpace::size_type i ) + { return ( i + WarpIndexMask ) >> WarpIndexShift ; } + + KOKKOS_INLINE_FUNCTION static + CudaSpace::size_type warp_align( CudaSpace::size_type i ) + { + enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) }; + return ( i + WarpIndexMask ) & Mask ; + } +}; + +//---------------------------------------------------------------------------- + +CudaSpace::size_type cuda_internal_maximum_warp_count(); 
+CudaSpace::size_type cuda_internal_maximum_grid_count(); +CudaSpace::size_type cuda_internal_maximum_shared_words(); + +CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size ); +CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size ); +CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size ); + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( __CUDACC__ ) + +/** \brief Access to constant memory on the device */ +#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE +extern +#endif +__device__ __constant__ +Kokkos::Impl::CudaTraits::ConstantGlobalBufferType +kokkos_impl_cuda_constant_memory_buffer ; + +__device__ __constant__ +int* kokkos_impl_cuda_atomic_lock_array ; +#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF +#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39 + +namespace Kokkos { +namespace Impl { +__device__ inline +bool lock_address_cuda_space(void* ptr) { + size_t offset = size_t(ptr); + offset = offset >> 2; + offset = offset & CUDA_SPACE_ATOMIC_MASK; + //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK; + return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1)); +} + +__device__ inline +void unlock_address_cuda_space(void* ptr) { + size_t offset = size_t(ptr); + offset = offset >> 2; + offset = offset & CUDA_SPACE_ATOMIC_MASK; + //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK; + atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0); +} + +} +} + +template< typename T > +inline +__device__ +T * kokkos_impl_cuda_shared_memory() +{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; } + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +// See section B.17 of Cuda C Programming Guide Version 3.2 +// for 
discussion of +// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) +// function qualifier which could be used to improve performance. +//---------------------------------------------------------------------------- +// Maximize L1 cache and minimize shared memory: +// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 ); +// For 2.0 capability: 48 KB L1 and 16 KB shared +//---------------------------------------------------------------------------- + +template< class DriverType > +__global__ +static void cuda_parallel_launch_constant_memory() +{ + const DriverType & driver = + *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer ); + + driver(); +} + +template< class DriverType > +__global__ +static void cuda_parallel_launch_local_memory( const DriverType driver ) +{ + driver(); +} + +template < class DriverType , + bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) > +struct CudaParallelLaunch ; + +template < class DriverType > +struct CudaParallelLaunch< DriverType , true > { + + inline + CudaParallelLaunch( const DriverType & driver + , const dim3 & grid + , const dim3 & block + , const int shmem + , const cudaStream_t stream = 0 ) + { + if ( grid.x && ( block.x * block.y * block.z ) ) { + + if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) < + sizeof( DriverType ) ) { + Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") ); + } + + if ( CudaTraits::SharedMemoryCapacity < shmem ) { + Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") ); + } + else if ( shmem ) { + cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ); + } else { + cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ); + } + + // Copy functor to constant memory on the device + cudaMemcpyToSymbol( 
kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) ); + + int* lock_array_ptr = lock_array_cuda_space_ptr(); + cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) ); + + // Invoke the driver function on the device + cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>(); + +#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) + Kokkos::Cuda::fence(); + CUDA_SAFE_CALL( cudaGetLastError() ); +#endif + } + } +}; + +template < class DriverType > +struct CudaParallelLaunch< DriverType , false > { + + inline + CudaParallelLaunch( const DriverType & driver + , const dim3 & grid + , const dim3 & block + , const int shmem + , const cudaStream_t stream = 0 ) + { + if ( grid.x && ( block.x * block.y * block.z ) ) { + + if ( CudaTraits::SharedMemoryCapacity < shmem ) { + Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") ); + } + else if ( shmem ) { + cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ); + } else { + cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ); + } + + int* lock_array_ptr = lock_array_cuda_space_ptr(); + cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) ); + + cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver ); + +#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) + Kokkos::Cuda::fence(); + CUDA_SAFE_CALL( cudaGetLastError() ); +#endif + } + } +}; + +//---------------------------------------------------------------------------- + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* defined( __CUDACC__ ) */ +#endif /* defined( KOKKOS_HAVE_CUDA ) */ +#endif /* #ifndef 
KOKKOS_CUDAEXEC_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp new file mode 100755 index 0000000000000000000000000000000000000000..5b397845c351887cbcc80f9abf31ba2d2615dedc --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -0,0 +1,670 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdlib.h> +#include <iostream> +#include <sstream> +#include <stdexcept> +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#ifdef KOKKOS_HAVE_CUDA + +#include <Kokkos_Cuda.hpp> +#include <Kokkos_CudaSpace.hpp> + +#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp> +#include <Cuda/Kokkos_Cuda_Internal.hpp> +#include <impl/Kokkos_Error.hpp> + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +DeepCopy<CudaSpace,CudaSpace>::DeepCopy( void * dst , const void * src , size_t n ) +{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); } + +DeepCopy<CudaSpace,CudaSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n ) +{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); } + +DeepCopy<HostSpace,CudaSpace>::DeepCopy( void * dst , const void * src , size_t n ) +{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); } + +DeepCopy<HostSpace,CudaSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n ) +{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , 
cudaMemcpyDefault , instance.cuda_stream() ) ); } + +DeepCopy<CudaSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n ) +{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); } + +DeepCopy<CudaSpace,HostSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n ) +{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); } + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + + +namespace Kokkos { + +namespace { + +void texture_object_attach_impl( Impl::AllocationTracker const & tracker + , unsigned type_size + , ::cudaChannelFormatDesc const & desc + ) +{ + enum { TEXTURE_BOUND_1D = 2u << 27 }; + + if ( tracker.attribute() == NULL ) { + // check for correct allocator + const bool ok_alloc = tracker.allocator()->support_texture_binding(); + + const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D; + + if (ok_alloc && ok_count) { + Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc ); + tracker.set_attribute( attr ); + } + else { + std::ostringstream oss; + oss << "Error: Cannot attach texture object"; + if (!ok_alloc) { + oss << ", incompatabile allocator " << tracker.allocator()->name(); + } + if (!ok_count) { + oss << ", array " << tracker.label() << " too large"; + } + oss << "."; + Kokkos::Impl::throw_runtime_exception( oss.str() ); + } + } + + if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) { + std::ostringstream oss; + oss << "Error: Allocation " << tracker.label() << " already has an attribute attached."; + Kokkos::Impl::throw_runtime_exception( oss.str() ); + } + +} + +} // unnamed namespace + +/*--------------------------------------------------------------------------*/ + +Impl::AllocationTracker 
CudaSpace::allocate_and_track( const std::string & label, const size_t size ) +{ + return Impl::AllocationTracker( allocator(), size, label); +} + +void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker + , unsigned type_size + , ::cudaChannelFormatDesc const & desc + ) +{ + texture_object_attach_impl( tracker, type_size, desc ); +} + +void CudaSpace::access_error() +{ + const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" ); + Kokkos::Impl::throw_runtime_exception( msg ); +} + +void CudaSpace::access_error( const void * const ) +{ + const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" ); + Kokkos::Impl::throw_runtime_exception( msg ); +} + +/*--------------------------------------------------------------------------*/ + +Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size ) +{ + return Impl::AllocationTracker( allocator(), size, label); +} + +void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & tracker + , unsigned type_size + , ::cudaChannelFormatDesc const & desc + ) +{ + texture_object_attach_impl( tracker, type_size, desc ); +} + +bool CudaUVMSpace::available() +{ +#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__) + enum { UVM_available = true }; +#else + enum { UVM_available = false }; +#endif + return UVM_available; +} + +/*--------------------------------------------------------------------------*/ + +Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size ) +{ + return Impl::AllocationTracker( allocator(), size, label); +} + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +CudaSpace::CudaSpace() + : m_device( 
Kokkos::Cuda().cuda_device() ) +{ +} + +CudaUVMSpace::CudaUVMSpace() + : m_device( Kokkos::Cuda().cuda_device() ) +{ +} + +CudaHostPinnedSpace::CudaHostPinnedSpace() +{ +} + +void * CudaSpace::allocate( const size_t arg_alloc_size ) const +{ + void * ptr = NULL; + + CUDA_SAFE_CALL( cudaMalloc( &ptr, arg_alloc_size ) ); + + return ptr ; +} + +void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const +{ + void * ptr = NULL; + + CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) ); + + return ptr ; +} + +void * CudaHostPinnedSpace::allocate( const size_t arg_alloc_size ) const +{ + void * ptr = NULL; + + CUDA_SAFE_CALL( cudaHostAlloc( &ptr, arg_alloc_size , cudaHostAllocDefault ) ); + + return ptr ; +} + +void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const +{ + try { + CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) ); + } catch(...) {} +} + +void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const +{ + try { + CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) ); + } catch(...) {} +} + +void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const +{ + try { + CUDA_SAFE_CALL( cudaFreeHost( arg_alloc_ptr ) ); + } catch(...) 
{} +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +SharedAllocationRecord< void , void > +SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record ; + +SharedAllocationRecord< void , void > +SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record ; + +SharedAllocationRecord< void , void > +SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record ; + +::cudaTextureObject_t +SharedAllocationRecord< Kokkos::CudaSpace , void >:: +attach_texture_object( const unsigned sizeof_alias + , void * const alloc_ptr + , size_t const alloc_size ) +{ + // Only valid for 300 <= __CUDA_ARCH__ + // otherwise return zero. + + ::cudaTextureObject_t tex_obj ; + + struct cudaResourceDesc resDesc ; + struct cudaTextureDesc texDesc ; + + memset( & resDesc , 0 , sizeof(resDesc) ); + memset( & texDesc , 0 , sizeof(texDesc) ); + + resDesc.resType = cudaResourceTypeLinear ; + resDesc.res.linear.desc = ( sizeof_alias == 4 ? cudaCreateChannelDesc< int >() : + ( sizeof_alias == 8 ? 
cudaCreateChannelDesc< ::int2 >() : + /* sizeof_alias == 16 */ cudaCreateChannelDesc< ::int4 >() ) ); + resDesc.res.linear.sizeInBytes = alloc_size ; + resDesc.res.linear.devPtr = alloc_ptr ; + + CUDA_SAFE_CALL( cudaCreateTextureObject( & tex_obj , & resDesc, & texDesc, NULL ) ); + + return tex_obj ; +} + +std::string +SharedAllocationRecord< Kokkos::CudaSpace , void >::get_label() const +{ + SharedAllocationHeader header ; + + Kokkos::Impl::DeepCopy< Kokkos::HostSpace , Kokkos::CudaSpace >( & header , RecordBase::head() , sizeof(SharedAllocationHeader) ); + + return std::string( header.m_label ); +} + +std::string +SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_label() const +{ + return std::string( RecordBase::head()->m_label ); +} + +std::string +SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_label() const +{ + return std::string( RecordBase::head()->m_label ); +} + +SharedAllocationRecord< Kokkos::CudaSpace , void > * +SharedAllocationRecord< Kokkos::CudaSpace , void >:: +allocate( const Kokkos::CudaSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + ) +{ + return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size ); +} + +SharedAllocationRecord< Kokkos::CudaUVMSpace , void > * +SharedAllocationRecord< Kokkos::CudaUVMSpace , void >:: +allocate( const Kokkos::CudaUVMSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + ) +{ + return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size ); +} + +SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > * +SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >:: +allocate( const Kokkos::CudaHostPinnedSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + ) +{ + return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size ); +} + +void +SharedAllocationRecord< Kokkos::CudaSpace , void >:: +deallocate( SharedAllocationRecord< void , void > 
* arg_rec ) +{ + delete static_cast<SharedAllocationRecord*>(arg_rec); +} + +void +SharedAllocationRecord< Kokkos::CudaUVMSpace , void >:: +deallocate( SharedAllocationRecord< void , void > * arg_rec ) +{ + delete static_cast<SharedAllocationRecord*>(arg_rec); +} + +void +SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >:: +deallocate( SharedAllocationRecord< void , void > * arg_rec ) +{ + delete static_cast<SharedAllocationRecord*>(arg_rec); +} + +SharedAllocationRecord< Kokkos::CudaSpace , void >:: +~SharedAllocationRecord() +{ + m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr + , SharedAllocationRecord< void , void >::m_alloc_size + ); +} + +SharedAllocationRecord< Kokkos::CudaUVMSpace , void >:: +~SharedAllocationRecord() +{ + m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr + , SharedAllocationRecord< void , void >::m_alloc_size + ); +} + +SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >:: +~SharedAllocationRecord() +{ + m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr + , SharedAllocationRecord< void , void >::m_alloc_size + ); +} + +SharedAllocationRecord< Kokkos::CudaSpace , void >:: +SharedAllocationRecord( const Kokkos::CudaSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + , const SharedAllocationRecord< void , void >::function_type arg_dealloc + ) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : SharedAllocationRecord< void , void > + ( & SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record + , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) ) + , sizeof(SharedAllocationHeader) + arg_alloc_size + , arg_dealloc + ) + , m_tex_obj( 0 ) + , m_space( arg_space ) +{ + SharedAllocationHeader header ; + + // Fill in the Header information + header.m_record = static_cast< SharedAllocationRecord< void 
, void > * >( this ); + + strncpy( header.m_label + , arg_label.c_str() + , SharedAllocationHeader::maximum_label_length + ); + + // Copy to device memory + Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>::DeepCopy( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) ); +} + +SharedAllocationRecord< Kokkos::CudaUVMSpace , void >:: +SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + , const SharedAllocationRecord< void , void >::function_type arg_dealloc + ) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : SharedAllocationRecord< void , void > + ( & SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record + , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) ) + , sizeof(SharedAllocationHeader) + arg_alloc_size + , arg_dealloc + ) + , m_tex_obj( 0 ) + , m_space( arg_space ) +{ + // Fill in the Header information, directly accessible via UVM + + RecordBase::m_alloc_ptr->m_record = this ; + + strncpy( RecordBase::m_alloc_ptr->m_label + , arg_label.c_str() + , SharedAllocationHeader::maximum_label_length + ); +} + +SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >:: +SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + , const SharedAllocationRecord< void , void >::function_type arg_dealloc + ) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : SharedAllocationRecord< void , void > + ( & SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record + , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) ) + , sizeof(SharedAllocationHeader) + arg_alloc_size + , arg_dealloc + ) + , m_space( arg_space ) +{ + // Fill 
in the Header information, directly accessible via UVM + + RecordBase::m_alloc_ptr->m_record = this ; + + strncpy( RecordBase::m_alloc_ptr->m_label + , arg_label.c_str() + , SharedAllocationHeader::maximum_label_length + ); +} + +SharedAllocationRecord< Kokkos::CudaSpace , void > * +SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr ) +{ + using Header = SharedAllocationHeader ; + using RecordBase = SharedAllocationRecord< void , void > ; + using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ; + +#if 0 + // Copy the header from the allocation + SharedAllocationHeader head ; + + SharedAllocationHeader const * const head_cuda = Header::get_header( alloc_ptr ); + + Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) ); + + RecordCuda * const record = static_cast< RecordCuda * >( head.m_record ); + + if ( record->m_alloc_ptr != head_cuda ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) ); + } + +#else + + // Iterate the list to search for the record among all allocations + // requires obtaining the root of the list and then locking the list. 
+ + RecordCuda * const record = static_cast< RecordCuda * >( RecordBase::find( & s_root_record , alloc_ptr ) ); + + if ( record == 0 ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) ); + } + +#endif + + return record ; +} + +SharedAllocationRecord< Kokkos::CudaUVMSpace , void > * +SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_ptr ) +{ + using Header = SharedAllocationHeader ; + using RecordCuda = SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ; + + Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ; + + if ( h->m_record->m_alloc_ptr != h ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) ); + } + + return static_cast< RecordCuda * >( h->m_record ); +} + +SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > * +SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void * alloc_ptr ) +{ + using Header = SharedAllocationHeader ; + using RecordCuda = SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > ; + + Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ; + + if ( h->m_record->m_alloc_ptr != h ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) ); + } + + return static_cast< RecordCuda * >( h->m_record ); +} + +// Iterate records to print orphaned memory ... 
+void +SharedAllocationRecord< Kokkos::CudaSpace , void >:: +print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail ) +{ + SharedAllocationRecord< void , void > * r = & s_root_record ; + + char buffer[256] ; + + SharedAllocationHeader head ; + + if ( detail ) { + do { + if ( r->m_alloc_ptr ) { + Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) ); + } + else { + head.m_label[0] = 0 ; + } + + snprintf( buffer , 256 , "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n" + , reinterpret_cast<unsigned long>( r ) + , reinterpret_cast<unsigned long>( r->m_prev ) + , reinterpret_cast<unsigned long>( r->m_next ) + , reinterpret_cast<unsigned long>( r->m_alloc_ptr ) + , r->m_alloc_size + , r->m_count + , reinterpret_cast<unsigned long>( r->m_dealloc ) + , head.m_label + ); + std::cout << buffer ; + r = r->m_next ; + } while ( r != & s_root_record ); + } + else { + do { + if ( r->m_alloc_ptr ) { + + Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) ); + + snprintf( buffer , 256 , "Cuda [ 0x%.12lx + %ld ] %s\n" + , reinterpret_cast< unsigned long >( r->data() ) + , r->size() + , head.m_label + ); + } + else { + snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" ); + } + std::cout << buffer ; + r = r->m_next ; + } while ( r != & s_root_record ); + } +} + +void +SharedAllocationRecord< Kokkos::CudaUVMSpace , void >:: +print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail ) +{ + SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail ); +} + +void +SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >:: +print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail ) +{ + SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & 
s_root_record , detail ); +} + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace { + __global__ void init_lock_array_kernel() { + unsigned i = blockIdx.x*blockDim.x + threadIdx.x; + + if(i<CUDA_SPACE_ATOMIC_MASK+1) + kokkos_impl_cuda_atomic_lock_array[i] = 0; + } +} + +namespace Impl { +int* lock_array_cuda_space_ptr(bool deallocate) { + static int* ptr = NULL; + if(deallocate) { + cudaFree(ptr); + ptr = NULL; + } + + if(ptr==NULL && !deallocate) + cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)); + return ptr; +} + +void init_lock_array_cuda_space() { + int is_initialized = 0; + if(! is_initialized) { + int* lock_array_ptr = lock_array_cuda_space_ptr(); + cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) ); + init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>(); + } +} + +} +} +#endif // KOKKOS_HAVE_CUDA + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp new file mode 100755 index 0000000000000000000000000000000000000000..e1314c0e511a96e82250d1ad39985f52547f5a51 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp @@ -0,0 +1,183 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP +#define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#ifdef KOKKOS_HAVE_CUDA + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase + +namespace Kokkos { +namespace Impl { + +template< class DestructFunctor > +SharedAllocationRecord * +shared_allocation_record( Kokkos::CudaSpace const & arg_space + , void * const arg_alloc_ptr + , DestructFunctor const & arg_destruct ) +{ + SharedAllocationRecord * const record = SharedAllocationRecord::get_record( arg_alloc_ptr ); + + // assert: record != 0 + + // assert: sizeof(DestructFunctor) <= record->m_destruct_size + + // assert: record->m_destruct_function == 0 + + DestructFunctor * const functor = + reinterpret_cast< DestructFunctor * >( + reinterpret_cast< unsigned long >( record ) + sizeof(SharedAllocationRecord) ); + + new( functor ) DestructFunctor( arg_destruct ); + + record->m_destruct_functor = & shared_allocation_destroy< DestructFunctor > ; + + return record ; +} + + +/// class CudaUnmanagedAllocator +/// does nothing when deallocate(ptr,size) is called +struct CudaUnmanagedAllocator +{ + static const char * name() + { + return "Cuda Unmanaged Allocator"; + } + + static void deallocate(void * /*ptr*/, size_t /*size*/) {} + + static bool support_texture_binding() { return true; } +}; + +/// class CudaUnmanagedAllocator +/// does nothing when deallocate(ptr,size) is called +struct CudaUnmanagedUVMAllocator +{ + static const char * name() + { + return "Cuda Unmanaged UVM Allocator"; + } + + static void deallocate(void * /*ptr*/, size_t /*size*/) {} + + static bool support_texture_binding() { return true; } +}; + +/// class CudaUnmanagedHostAllocator +/// does nothing when deallocate(ptr,size) is called 
+class CudaUnmanagedHostAllocator +{ +public: + static const char * name() + { + return "Cuda Unmanaged Host Allocator"; + } + // Unmanaged deallocate does nothing + static void deallocate(void * /*ptr*/, size_t /*size*/) {} +}; + +/// class CudaMallocAllocator +class CudaMallocAllocator +{ +public: + static const char * name() + { + return "Cuda Malloc Allocator"; + } + + static void* allocate(size_t size); + + static void deallocate(void * ptr, size_t); + + static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); + + static bool support_texture_binding() { return true; } +}; + +/// class CudaUVMAllocator +class CudaUVMAllocator +{ +public: + static const char * name() + { + return "Cuda UVM Allocator"; + } + + static void* allocate(size_t size); + + static void deallocate(void * ptr, size_t); + + static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); + + static bool support_texture_binding() { return true; } +}; + +/// class CudaHostAllocator +class CudaHostAllocator +{ +public: + static const char * name() + { + return "Cuda Host Allocator"; + } + + static void* allocate(size_t size); + + static void deallocate(void * ptr, size_t); + + static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); +}; + + +}} // namespace Kokkos::Impl + +#endif //KOKKOS_HAVE_CUDA + +#endif // #ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp new file mode 100755 index 0000000000000000000000000000000000000000..8c8c5e47a5b13eebc7c09b8e69d5fb728b4988c4 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp @@ -0,0 +1,192 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#ifdef KOKKOS_HAVE_CUDA + +#include <impl/Kokkos_Error.hpp> +#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp> +#include <Cuda/Kokkos_Cuda_Error.hpp> + +#include <sstream> + +namespace Kokkos { namespace Impl { + + +/*--------------------------------------------------------------------------*/ +TextureAttribute::TextureAttribute( void * const alloc_ptr + , size_t alloc_size + , cudaChannelFormatDesc const & desc + ) + : m_tex_obj(0) +{ + cuda_device_synchronize(); + + struct cudaResourceDesc resDesc ; + struct cudaTextureDesc texDesc ; + + memset( & resDesc , 0 , sizeof(resDesc) ); + memset( & texDesc , 0 , sizeof(texDesc) ); + + resDesc.resType = cudaResourceTypeLinear ; + resDesc.res.linear.desc = desc ; + resDesc.res.linear.sizeInBytes = alloc_size ; + resDesc.res.linear.devPtr = alloc_ptr ; + + CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) ); + + cuda_device_synchronize(); +} + + +TextureAttribute::~TextureAttribute() +{ + if (m_tex_obj) { + cudaDestroyTextureObject( m_tex_obj ); + } +} + +/*--------------------------------------------------------------------------*/ + +void * CudaMallocAllocator::allocate( size_t size ) +{ + void * ptr = NULL; + + CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) ); + + return ptr; +} + +void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ ) +{ + try { + CUDA_SAFE_CALL( cudaFree( ptr ) ); + } catch(...) {} +} + +void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size) +{ + void * ptr = old_ptr; + if (old_size != new_size) { + ptr = allocate( new_size ); + size_t copy_size = old_size < new_size ? 
old_size : new_size; + + CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) ); + + deallocate( old_ptr, old_size ); + } + return ptr; +} + +/*--------------------------------------------------------------------------*/ + +void * CudaUVMAllocator::allocate( size_t size ) +{ +#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) + void * ptr = NULL; + CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) ); + return ptr; +#else + throw_runtime_exception( "CUDA VERSION does not support UVM" ); + return NULL; +#endif +} + +void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ ) +{ + try { + CUDA_SAFE_CALL( cudaFree( ptr ) ); + } catch(...) {} +} + +void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size) +{ + void * ptr = old_ptr; + if (old_size != new_size) { + ptr = allocate( new_size ); + size_t copy_size = old_size < new_size ? old_size : new_size; + + CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) ); + + deallocate( old_ptr, old_size ); + } + return ptr; +} + +/*--------------------------------------------------------------------------*/ + +void * CudaHostAllocator::allocate( size_t size ) +{ + void * ptr = NULL; + CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) ); + return ptr; +} + +void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ ) +{ + try { + CUDA_SAFE_CALL( cudaFreeHost( ptr ) ); + } catch(...) {} +} + +void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size) +{ + void * ptr = old_ptr; + if (old_size != new_size) { + ptr = allocate( new_size ); + size_t copy_size = old_size < new_size ? 
old_size : new_size; + + CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) ); + + deallocate( old_ptr, old_size ); + } + return ptr; +} + +/*--------------------------------------------------------------------------*/ + +}} // namespace Kokkos::Impl + +#endif //KOKKOS_HAVE_CUDA diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp new file mode 100755 index 0000000000000000000000000000000000000000..86fe1c901bcbe62dd0f1e97e9b933a17da6283d7 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp @@ -0,0 +1,187 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP +#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#ifdef KOKKOS_HAVE_CUDA + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase + +namespace Kokkos { namespace Impl { + + +// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t' +// to be an 'unsigned long long'. This chould change with +// future version of Cuda and this typedef would have to +// change accordingly. 
+ +#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION ) + +typedef enable_if< + sizeof(::cudaTextureObject_t) == sizeof(const void *) , + ::cudaTextureObject_t >::type cuda_texture_object_type ; + +#else + +typedef const void * cuda_texture_object_type ; + +#endif + + +struct TextureAttribute : public AllocatorAttributeBase +{ + cuda_texture_object_type m_tex_obj ; + + TextureAttribute( void * const alloc_ptr + , size_t alloc_size + , cudaChannelFormatDesc const & desc + ); + + ~TextureAttribute(); +}; + + +/// class CudaUnmanagedAllocator +/// does nothing when deallocate(ptr,size) is called +struct CudaUnmanagedAllocator +{ + static const char * name() + { + return "Cuda Unmanaged Allocator"; + } + + static void deallocate(void * /*ptr*/, size_t /*size*/) {} + + static bool support_texture_binding() { return true; } +}; + +/// class CudaUnmanagedAllocator +/// does nothing when deallocate(ptr,size) is called +struct CudaUnmanagedUVMAllocator +{ + static const char * name() + { + return "Cuda Unmanaged UVM Allocator"; + } + + static void deallocate(void * /*ptr*/, size_t /*size*/) {} + + static bool support_texture_binding() { return true; } +}; + +/// class CudaUnmanagedHostAllocator +/// does nothing when deallocate(ptr,size) is called +class CudaUnmanagedHostAllocator +{ +public: + static const char * name() + { + return "Cuda Unmanaged Host Allocator"; + } + // Unmanaged deallocate does nothing + static void deallocate(void * /*ptr*/, size_t /*size*/) {} +}; + +/// class CudaMallocAllocator +class CudaMallocAllocator +{ +public: + static const char * name() + { + return "Cuda Malloc Allocator"; + } + + static void* allocate(size_t size); + + static void deallocate(void * ptr, size_t); + + static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); + + static bool support_texture_binding() { return true; } +}; + +/// class CudaUVMAllocator +class CudaUVMAllocator +{ +public: + static const char * name() + { + return "Cuda UVM Allocator"; + } 
+ + static void* allocate(size_t size); + + static void deallocate(void * ptr, size_t); + + static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); + + static bool support_texture_binding() { return true; } +}; + +/// class CudaHostAllocator +class CudaHostAllocator +{ +public: + static const char * name() + { + return "Cuda Host Allocator"; + } + + static void* allocate(size_t size); + + static void deallocate(void * ptr, size_t); + + static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); +}; + + +}} // namespace Kokkos::Impl + +#endif //KOKKOS_HAVE_CUDA + +#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp new file mode 100755 index 0000000000000000000000000000000000000000..a0b29ddc2b270212f9c8b9d18e6ee394b9a61b39 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -0,0 +1,69 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_ERROR_HPP +#define KOKKOS_CUDA_ERROR_HPP + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#ifdef KOKKOS_HAVE_CUDA + +namespace Kokkos { namespace Impl { + +void cuda_device_synchronize(); + +void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 ); + +inline void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0) +{ + if ( cudaSuccess != e ) { cuda_internal_error_throw( e , name, file, line ); } +} + +#define CUDA_SAFE_CALL( call ) \ + Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ ) + +}} // namespace Kokkos::Impl + +#endif //KOKKOS_HAVE_CUDA +#endif //KOKKOS_CUDA_ERROR_HPP diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp new file mode 100755 index 0000000000000000000000000000000000000000..b7c3a62d39934369e1ec1a5089f13abf1dfa94a5 --- /dev/null +++ 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp @@ -0,0 +1,678 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/*--------------------------------------------------------------------------*/ +/* Kokkos interfaces */ + +#include <Kokkos_Core.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#ifdef KOKKOS_HAVE_CUDA + +#include <Cuda/Kokkos_Cuda_Error.hpp> +#include <Cuda/Kokkos_Cuda_Internal.hpp> +#include <impl/Kokkos_AllocationTracker.hpp> +#include <impl/Kokkos_Error.hpp> + +/*--------------------------------------------------------------------------*/ +/* Standard 'C' libraries */ +#include <stdlib.h> + +/* Standard 'C++' libraries */ +#include <vector> +#include <iostream> +#include <sstream> +#include <string> + +#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE +__device__ __constant__ +Kokkos::Impl::CudaTraits::ConstantGlobalBufferType +kokkos_impl_cuda_constant_memory_buffer ; +#endif + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +namespace { + +__global__ +void query_cuda_kernel_arch( int * d_arch ) +{ +#if defined( __CUDA_ARCH__ ) + *d_arch = __CUDA_ARCH__ ; +#else + *d_arch = 0 ; +#endif +} + +/** Query what compute capability is actually launched to the device: */ +int cuda_kernel_arch() +{ + int * d_arch = 0 ; + cudaMalloc( (void **) & d_arch , sizeof(int) ); + query_cuda_kernel_arch<<<1,1>>>( d_arch ); + int arch = 0 ; + cudaMemcpy( & arch , d_arch , sizeof(int) , cudaMemcpyDefault ); + cudaFree( d_arch ); + return arch ; +} + +bool cuda_launch_blocking() +{ + const char * env = getenv("CUDA_LAUNCH_BLOCKING"); + + if (env == 0) return false; + + return atoi(env); +} + +} + +void cuda_device_synchronize() +{ +// static const bool launch_blocking = cuda_launch_blocking(); + +// if (!launch_blocking) { + CUDA_SAFE_CALL( cudaDeviceSynchronize() ); +// } +} + +void cuda_internal_error_throw( cudaError e , const char * name, const char * file, 
const int line ) +{ + std::ostringstream out ; + out << name << " error( " << cudaGetErrorName(e) << "): " << cudaGetErrorString(e); + if (file) { + out << " " << file << ":" << line; + } + throw_runtime_exception( out.str() ); +} + +//---------------------------------------------------------------------------- +// Some significant cuda device properties: +// +// cudaDeviceProp::name : Text label for device +// cudaDeviceProp::major : Device major number +// cudaDeviceProp::minor : Device minor number +// cudaDeviceProp::warpSize : number of threads per warp +// cudaDeviceProp::multiProcessorCount : number of multiprocessors +// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block +// cudaDeviceProp::totalConstMem : capacity of constant memory +// cudaDeviceProp::totalGlobalMem : capacity of global memory +// cudaDeviceProp::maxGridSize[3] : maximum grid size + +// +// Section 4.4.2.4 of the CUDA Toolkit Reference Manual +// +// struct cudaDeviceProp { +// char name[256]; +// size_t totalGlobalMem; +// size_t sharedMemPerBlock; +// int regsPerBlock; +// int warpSize; +// size_t memPitch; +// int maxThreadsPerBlock; +// int maxThreadsDim[3]; +// int maxGridSize[3]; +// size_t totalConstMem; +// int major; +// int minor; +// int clockRate; +// size_t textureAlignment; +// int deviceOverlap; +// int multiProcessorCount; +// int kernelExecTimeoutEnabled; +// int integrated; +// int canMapHostMemory; +// int computeMode; +// int concurrentKernels; +// int ECCEnabled; +// int pciBusID; +// int pciDeviceID; +// int tccDriver; +// int asyncEngineCount; +// int unifiedAddressing; +// int memoryClockRate; +// int memoryBusWidth; +// int l2CacheSize; +// int maxThreadsPerMultiProcessor; +// }; + + +namespace { + + + +class CudaInternalDevices { +public: + enum { MAXIMUM_DEVICE_COUNT = 8 }; + struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ; + int m_cudaDevCount ; + + CudaInternalDevices(); + + static const CudaInternalDevices & singleton(); +}; + 
+CudaInternalDevices::CudaInternalDevices() +{ + // See 'cudaSetDeviceFlags' for host-device thread interaction + // Section 4.4.2.6 of the CUDA Toolkit Reference Manual + + CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) ); + + for ( int i = 0 ; i < m_cudaDevCount ; ++i ) { + CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) ); + } +} + +const CudaInternalDevices & CudaInternalDevices::singleton() +{ + static CudaInternalDevices self ; return self ; +} + +} + +//---------------------------------------------------------------------------- + +class CudaInternal { +private: + + CudaInternal( const CudaInternal & ); + CudaInternal & operator = ( const CudaInternal & ); + + AllocationTracker m_scratchFlagsTracker; + AllocationTracker m_scratchSpaceTracker; + AllocationTracker m_scratchUnifiedTracker; + + +public: + + typedef Cuda::size_type size_type ; + + int m_cudaDev ; + int m_cudaArch ; + unsigned m_maxWarpCount ; + unsigned m_maxBlock ; + unsigned m_maxSharedWords ; + size_type m_scratchSpaceCount ; + size_type m_scratchFlagsCount ; + size_type m_scratchUnifiedCount ; + size_type m_scratchUnifiedSupported ; + size_type m_streamCount ; + size_type * m_scratchSpace ; + size_type * m_scratchFlags ; + size_type * m_scratchUnified ; + cudaStream_t * m_stream ; + + + static CudaInternal & singleton(); + + int verify_is_initialized( const char * const label ) const ; + + int is_initialized() const + { return 0 != m_scratchSpace && 0 != m_scratchFlags ; } + + void initialize( int cuda_device_id , int stream_count ); + void finalize(); + + void print_configuration( std::ostream & ) const ; + + ~CudaInternal(); + + CudaInternal() + : m_cudaDev( -1 ) + , m_cudaArch( -1 ) + , m_maxWarpCount( 0 ) + , m_maxBlock( 0 ) + , m_maxSharedWords( 0 ) + , m_scratchSpaceCount( 0 ) + , m_scratchFlagsCount( 0 ) + , m_scratchUnifiedCount( 0 ) + , m_scratchUnifiedSupported( 0 ) + , m_streamCount( 0 ) + , m_scratchSpace( 0 ) + , m_scratchFlags( 0 ) + , m_scratchUnified( 0 ) 
+ , m_stream( 0 ) + {} + + size_type * scratch_space( const size_type size ); + size_type * scratch_flags( const size_type size ); + size_type * scratch_unified( const size_type size ); +}; + +//---------------------------------------------------------------------------- + + +void CudaInternal::print_configuration( std::ostream & s ) const +{ + const CudaInternalDevices & dev_info = CudaInternalDevices::singleton(); + +#if defined( KOKKOS_HAVE_CUDA ) + s << "macro KOKKOS_HAVE_CUDA : defined" << std::endl ; +#endif +#if defined( CUDA_VERSION ) + s << "macro CUDA_VERSION = " << CUDA_VERSION + << " = version " << CUDA_VERSION / 1000 + << "." << ( CUDA_VERSION % 1000 ) / 10 + << std::endl ; +#endif + + for ( int i = 0 ; i < dev_info.m_cudaDevCount ; ++i ) { + s << "Kokkos::Cuda[ " << i << " ] " + << dev_info.m_cudaProp[i].name + << " capability " << dev_info.m_cudaProp[i].major << "." << dev_info.m_cudaProp[i].minor + << ", Total Global Memory: " << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem) + << ", Shared Memory per Block: " << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock); + if ( m_cudaDev == i ) s << " : Selected" ; + s << std::endl ; + } +} + +//---------------------------------------------------------------------------- + +CudaInternal::~CudaInternal() +{ + if ( m_stream || + m_scratchSpace || + m_scratchFlags || + m_scratchUnified ) { + std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" + << std::endl ; + std::cerr.flush(); + } + + m_cudaDev = -1 ; + m_cudaArch = -1 ; + m_maxWarpCount = 0 ; + m_maxBlock = 0 ; + m_maxSharedWords = 0 ; + m_scratchSpaceCount = 0 ; + m_scratchFlagsCount = 0 ; + m_scratchUnifiedCount = 0 ; + m_scratchUnifiedSupported = 0 ; + m_streamCount = 0 ; + m_scratchSpace = 0 ; + m_scratchFlags = 0 ; + m_scratchUnified = 0 ; + m_stream = 0 ; +} + +int CudaInternal::verify_is_initialized( const char * const label ) const +{ + if ( m_cudaDev < 0 ) { + std::cerr << "Kokkos::Cuda::" << label << " : 
ERROR device not initialized" << std::endl ; + } + return 0 <= m_cudaDev ; +} + +CudaInternal & CudaInternal::singleton() +{ + static CudaInternal self ; + return self ; +} + +void CudaInternal::initialize( int cuda_device_id , int stream_count ) +{ + enum { WordSize = sizeof(size_type) }; + + if ( ! HostSpace::execution_space::is_initialized() ) { + const std::string msg("Cuda::initialize ERROR : HostSpace::execution_space is not initialized"); + throw_runtime_exception( msg ); + } + + const CudaInternalDevices & dev_info = CudaInternalDevices::singleton(); + + const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ; + + const bool ok_id = 0 <= cuda_device_id && + cuda_device_id < dev_info.m_cudaDevCount ; + + // Need device capability 2.0 or better + + const bool ok_dev = ok_id && + ( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major && + 0 <= dev_info.m_cudaProp[ cuda_device_id ].minor ); + + if ( ok_init && ok_dev ) { + + const struct cudaDeviceProp & cudaProp = + dev_info.m_cudaProp[ cuda_device_id ]; + + m_cudaDev = cuda_device_id ; + + CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) ); + CUDA_SAFE_CALL( cudaDeviceReset() ); + Kokkos::Impl::cuda_device_synchronize(); + + // Query what compute capability architecture a kernel executes: + m_cudaArch = cuda_kernel_arch(); + + if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) { + std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability " + << ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 ) + << " on device with compute capability " + << cudaProp.major << "." << cudaProp.minor + << " , this will likely reduce potential performance." + << std::endl ; + } + + //---------------------------------- + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + + // HCE 2012-February : + // Found bug in CUDA 4.1 that sometimes a kernel launch would fail + // if the thread count == 1024 and a functor is passed to the kernel. 
+ // Copying the kernel to constant memory and then launching with + // thread count == 1024 would work fine. + // + // HCE 2012-October : + // All compute capabilities support at least 16 warps (512 threads). + // However, we have found that 8 warps typically gives better performance. + + m_maxWarpCount = 8 ; + + // m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ; + + if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) { + m_maxWarpCount = Impl::CudaTraits::WarpSize ; + } + + m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ; + + //---------------------------------- + // Maximum number of blocks: + + m_maxBlock = m_cudaArch < 300 ? 65535 : cudaProp.maxGridSize[0] ; + + //---------------------------------- + + m_scratchUnifiedSupported = cudaProp.unifiedAddressing ; + + if ( ! m_scratchUnifiedSupported ) { + std::cout << "Kokkos::Cuda device " + << cudaProp.name << " capability " + << cudaProp.major << "." << cudaProp.minor + << " does not support unified virtual address space" + << std::endl ; + } + + //---------------------------------- + // Multiblock reduction uses scratch flags for counters + // and scratch space for partial reduction values. + // Allocate some initial space. This will grow as needed. + + { + const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ; + + (void) scratch_unified( 16 * sizeof(size_type) ); + (void) scratch_flags( reduce_block_count * 2 * sizeof(size_type) ); + (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) ); + } + //---------------------------------- + + if ( stream_count ) { + m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) ); + m_streamCount = stream_count ; + for ( size_type i = 0 ; i < m_streamCount ; ++i ) m_stream[i] = 0 ; + } + } + else { + + std::ostringstream msg ; + msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ; + + if ( ! ok_init ) { + msg << " : Already initialized" ; + } + if ( ! 
ok_id ) { + msg << " : Device identifier out of range " + << "[0.." << dev_info.m_cudaDevCount << "]" ; + } + else if ( ! ok_dev ) { + msg << " : Device " ; + msg << dev_info.m_cudaProp[ cuda_device_id ].major ; + msg << "." ; + msg << dev_info.m_cudaProp[ cuda_device_id ].minor ; + msg << " has insufficient capability, required 2.0 or better" ; + } + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + + // Init the array for used for arbitrarily sized atomics + Impl::init_lock_array_cuda_space(); + +} + +//---------------------------------------------------------------------------- + +typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ; +enum { sizeScratchGrain = sizeof(ScratchGrain) }; + + +Cuda::size_type * +CudaInternal::scratch_flags( const Cuda::size_type size ) +{ + if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) { + + + m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; + + m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount ); + m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr()); + + CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) ); + } + + return m_scratchFlags ; +} + +Cuda::size_type * +CudaInternal::scratch_space( const Cuda::size_type size ) +{ + if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) { + + m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; + + m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount ); + m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr()); + + } + + return m_scratchSpace ; +} + +Cuda::size_type * +CudaInternal::scratch_unified( const Cuda::size_type size ) +{ + if ( 
verify_is_initialized("scratch_unified") && + m_scratchUnifiedSupported && m_scratchUnifiedCount * sizeScratchGrain < size ) { + + m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; + + m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount ); + m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() ); + } + + return m_scratchUnified ; +} + +//---------------------------------------------------------------------------- + +void CudaInternal::finalize() +{ + if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) { + + lock_array_cuda_space_ptr(true); + if ( m_stream ) { + for ( size_type i = 1 ; i < m_streamCount ; ++i ) { + cudaStreamDestroy( m_stream[i] ); + m_stream[i] = 0 ; + } + ::free( m_stream ); + } + + m_scratchSpaceTracker.clear(); + m_scratchFlagsTracker.clear(); + m_scratchUnifiedTracker.clear(); + + m_cudaDev = -1 ; + m_maxWarpCount = 0 ; + m_maxBlock = 0 ; + m_maxSharedWords = 0 ; + m_scratchSpaceCount = 0 ; + m_scratchFlagsCount = 0 ; + m_scratchUnifiedCount = 0 ; + m_streamCount = 0 ; + m_scratchSpace = 0 ; + m_scratchFlags = 0 ; + m_scratchUnified = 0 ; + m_stream = 0 ; + } +} + +//---------------------------------------------------------------------------- + +Cuda::size_type cuda_internal_maximum_warp_count() +{ return CudaInternal::singleton().m_maxWarpCount ; } + +Cuda::size_type cuda_internal_maximum_grid_count() +{ return CudaInternal::singleton().m_maxBlock ; } + +Cuda::size_type cuda_internal_maximum_shared_words() +{ return CudaInternal::singleton().m_maxSharedWords ; } + +Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size ) +{ return CudaInternal::singleton().scratch_space( size ); } + +Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size ) +{ return CudaInternal::singleton().scratch_flags( size ); } + +Cuda::size_type * 
cuda_internal_scratch_unified( const Cuda::size_type size ) +{ return CudaInternal::singleton().scratch_unified( size ); } + + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +Cuda::size_type Cuda::detect_device_count() +{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; } + +int Cuda::is_initialized() +{ return Impl::CudaInternal::singleton().is_initialized(); } + +void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances ) +{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances ); } + +std::vector<unsigned> +Cuda::detect_device_arch() +{ + const Impl::CudaInternalDevices & s = Impl::CudaInternalDevices::singleton(); + + std::vector<unsigned> output( s.m_cudaDevCount ); + + for ( int i = 0 ; i < s.m_cudaDevCount ; ++i ) { + output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor ; + } + + return output ; +} + +Cuda::size_type Cuda::device_arch() +{ + const int dev_id = Impl::CudaInternal::singleton().m_cudaDev ; + + int dev_arch = 0 ; + + if ( 0 <= dev_id ) { + const struct cudaDeviceProp & cudaProp = + Impl::CudaInternalDevices::singleton().m_cudaProp[ dev_id ] ; + + dev_arch = cudaProp.major * 100 + cudaProp.minor ; + } + + return dev_arch ; +} + +void Cuda::finalize() +{ Impl::CudaInternal::singleton().finalize(); } + +Cuda::Cuda() + : m_device( Impl::CudaInternal::singleton().m_cudaDev ) + , m_stream( 0 ) +{ + Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" ); +} + +Cuda::Cuda( const int instance_id ) + : m_device( Impl::CudaInternal::singleton().m_cudaDev ) + , m_stream( + Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" ) + ? 
Impl::CudaInternal::singleton().m_stream[ instance_id % Impl::CudaInternal::singleton().m_streamCount ] + : 0 ) +{} + +void Cuda::print_configuration( std::ostream & s , const bool ) +{ Impl::CudaInternal::singleton().print_configuration( s ); } + +bool Cuda::sleep() { return false ; } + +bool Cuda::wake() { return true ; } + +void Cuda::fence() +{ + Kokkos::Impl::cuda_device_synchronize(); +} + +} // namespace Kokkos + +#endif // KOKKOS_HAVE_CUDA +//---------------------------------------------------------------------------- + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp new file mode 100755 index 0000000000000000000000000000000000000000..dd8a08729b25792f9a62be0e1afbfedbbfcebd08 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp @@ -0,0 +1,165 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_INTERNAL_HPP +#define KOKKOS_CUDA_INTERNAL_HPP + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#ifdef KOKKOS_HAVE_CUDA + +#include <Cuda/Kokkos_Cuda_Error.hpp> + +namespace Kokkos { namespace Impl { + + +template<class DriverType> +int cuda_get_max_block_size(const typename DriverType::functor_type & f) { +#if ( CUDA_VERSION < 6050 ) + return 256; +#else + bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ); + + int numBlocks; + if(Large) { + int blockSize=32; + int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize ); + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks, + cuda_parallel_launch_constant_memory<DriverType>, + blockSize, + sharedmem); + + while (blockSize<1024 && numBlocks>0) { + blockSize*=2; + sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize ); + + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks, + 
cuda_parallel_launch_constant_memory<DriverType>, + blockSize, + sharedmem); + } + if(numBlocks>0) return blockSize; + else return blockSize/2; + } else { + int blockSize=32; + int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize ); + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks, + cuda_parallel_launch_local_memory<DriverType>, + blockSize, + sharedmem); + + while (blockSize<1024 && numBlocks>0) { + blockSize*=2; + sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize ); + + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks, + cuda_parallel_launch_local_memory<DriverType>, + blockSize, + sharedmem); + } + if(numBlocks>0) return blockSize; + else return blockSize/2; + } +#endif +} + +template<class DriverType> +int cuda_get_opt_block_size(const typename DriverType::functor_type & f) { +#if ( CUDA_VERSION < 6050 ) + return 256; +#else + bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ); + + int blockSize=16; + int numBlocks; + int sharedmem; + int maxOccupancy=0; + int bestBlockSize=0; + + if(Large) { + while(blockSize<1024) { + blockSize*=2; + + //calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far + sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize ); + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks, + cuda_parallel_launch_constant_memory<DriverType>, + blockSize, + sharedmem); + if(maxOccupancy < numBlocks*blockSize) { + maxOccupancy = numBlocks*blockSize; + bestBlockSize = blockSize; + } + } + } else { + while(blockSize<1024) { + blockSize*=2; + sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize ); + + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks, + cuda_parallel_launch_local_memory<DriverType>, + blockSize, + sharedmem); + + if(maxOccupancy < numBlocks*blockSize) { 
+ maxOccupancy = numBlocks*blockSize; + bestBlockSize = blockSize; + } + } + } + return bestBlockSize; +#endif +} + +}} // namespace Kokkos::Impl + +#endif // KOKKOS_HAVE_CUDA +#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */ + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp new file mode 100755 index 0000000000000000000000000000000000000000..ce33c978c711051694eb052fcce29b07ae081335 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -0,0 +1,1799 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_PARALLEL_HPP +#define KOKKOS_CUDA_PARALLEL_HPP + +#include <iostream> +#include <stdio.h> + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA ) + +#include <utility> +#include <Kokkos_Parallel.hpp> + +#include <Cuda/Kokkos_CudaExec.hpp> +#include <Cuda/Kokkos_Cuda_ReduceScan.hpp> +#include <Cuda/Kokkos_Cuda_Internal.hpp> +#include <Kokkos_Vectorization.hpp> + +#ifdef KOKKOSP_ENABLE_PROFILING +#include <impl/Kokkos_Profiling_Interface.hpp> +#include <typeinfo> +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< typename Type > +struct CudaJoinFunctor { + typedef Type value_type ; + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + volatile const value_type & input ) + { update += input ; } +}; + +class CudaTeamMember { +private: + + typedef Kokkos::Cuda execution_space ; + typedef execution_space::scratch_memory_space scratch_memory_space ; + + void * m_team_reduce ; + scratch_memory_space m_team_shared ; + int m_league_rank ; + int m_league_size ; + +public: 
+ +#if defined( __CUDA_ARCH__ ) + + __device__ inline + const execution_space::scratch_memory_space & team_shmem() const + { return m_team_shared ; } + + __device__ inline int league_rank() const { return m_league_rank ; } + __device__ inline int league_size() const { return m_league_size ; } + __device__ inline int team_rank() const { return threadIdx.y ; } + __device__ inline int team_size() const { return blockDim.y ; } + + __device__ inline void team_barrier() const { __syncthreads(); } + + template<class ValueType> + __device__ inline void team_broadcast(ValueType& value, const int& thread_id) const { + __shared__ ValueType sh_val; + if(threadIdx.x == 0 && threadIdx.y == thread_id) { + sh_val = val; + } + team_barrier(); + val = sh_val; + } + +#ifdef KOKKOS_HAVE_CXX11 + template< class ValueType, class JoinOp > + __device__ inline + typename JoinOp::value_type team_reduce( const ValueType & value + , const JoinOp & op_in ) const + { + typedef JoinLambdaAdapter<ValueType,JoinOp> JoinOpFunctor ; + const JoinOpFunctor op(op_in); + ValueType * const base_data = (ValueType *) m_team_reduce ; +#else + template< class JoinOp > + __device__ inline + typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value + , const JoinOp & op ) const + { + typedef JoinOp JoinOpFunctor ; + typename JoinOp::value_type * const base_data = (typename JoinOp::value_type *) m_team_reduce ; +#endif + + __syncthreads(); // Don't write in to shared data until all threads have entered this function + + if ( 0 == threadIdx.y ) { base_data[0] = 0 ; } + + base_data[ threadIdx.y ] = value ; + + Impl::cuda_intra_block_reduce_scan<false,JoinOpFunctor,void>( op , base_data ); + + return base_data[ blockDim.y - 1 ]; + } + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. 
+ * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template< typename Type > + __device__ inline Type team_scan( const Type & value , Type * const global_accum ) const + { + Type * const base_data = (Type *) m_team_reduce ; + + __syncthreads(); // Don't write in to shared data until all threads have entered this function + + if ( 0 == threadIdx.y ) { base_data[0] = 0 ; } + + base_data[ threadIdx.y + 1 ] = value ; + + Impl::cuda_intra_block_reduce_scan<true,Impl::CudaJoinFunctor<Type>,void>( Impl::CudaJoinFunctor<Type>() , base_data + 1 ); + + if ( global_accum ) { + if ( blockDim.y == threadIdx.y + 1 ) { + base_data[ blockDim.y ] = atomic_fetch_add( global_accum , base_data[ blockDim.y ] ); + } + __syncthreads(); // Wait for atomic + base_data[ threadIdx.y ] += base_data[ blockDim.y ] ; + } + + return base_data[ threadIdx.y ]; + } + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. 
+ * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template< typename Type > + __device__ inline Type team_scan( const Type & value ) const + { return this->template team_scan<Type>( value , 0 ); } + + //---------------------------------------- + // Private for the driver + + __device__ inline + CudaTeamMember( void * shared + , const int shared_begin + , const int shared_size + , const int arg_league_rank + , const int arg_league_size ) + : m_team_reduce( shared ) + , m_team_shared( ((char *)shared) + shared_begin , shared_size ) + , m_league_rank( arg_league_rank ) + , m_league_size( arg_league_size ) + {} + +#else + + const execution_space::scratch_memory_space & team_shmem() const {return m_team_shared;} + + int league_rank() const {return 0;} + int league_size() const {return 1;} + int team_rank() const {return 0;} + int team_size() const {return 1;} + + void team_barrier() const {} + template<class ValueType> + void team_broadcast(ValueType& value, const int& thread_id) const {} + + template< class JoinOp > + typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value + , const JoinOp & op ) const {return typename JoinOp::value_type();} + + template< typename Type > + Type team_scan( const Type & value , Type * const global_accum ) const {return Type();} + + template< typename Type > + Type team_scan( const Type & value ) const {return Type();} + + //---------------------------------------- + // Private for the driver + + CudaTeamMember( void * shared + , const int shared_begin + , const int shared_end + , const int arg_league_rank + , const int arg_league_size ); + +#endif /* #if ! 
defined( __CUDA_ARCH__ ) */ + +}; + +} // namespace Impl + +template< class Arg0 , class Arg1 > +class TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda > +{ +private: + + enum { MAX_WARP = 8 }; + + const int m_league_size ; + const int m_team_size ; + const int m_vector_length ; + +public: + + //! Tag this class as a kokkos execution policy + typedef TeamPolicy execution_policy ; + + //! Execution space of this execution policy + typedef Kokkos::Cuda execution_space ; + + typedef typename + Impl::if_c< ! Impl::is_same< Kokkos::Cuda , Arg0 >::value , Arg0 , Arg1 >::type + work_tag ; + + //---------------------------------------- + + template< class FunctorType > + inline static + int team_size_max( const FunctorType & functor ) + { + int n = MAX_WARP * Impl::CudaTraits::WarpSize ; + + for ( ; n ; n >>= 1 ) { + const int shmem_size = + /* for global reduce */ Impl::cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( functor , n ) + /* for team reduce */ + ( n + 2 ) * sizeof(double) + /* for team shared */ + Impl::FunctorTeamShmemSize< FunctorType >::value( functor , n ); + + if ( shmem_size < Impl::CudaTraits::SharedMemoryCapacity ) break ; + } + + return n ; + } + + template< class FunctorType > + static int team_size_recommended( const FunctorType & functor ) + { return team_size_max( functor ); } + + template< class FunctorType > + static int team_size_recommended( const FunctorType & functor , const int vector_length) + { + int max = team_size_max( functor )/vector_length; + if(max<1) max = 1; + return max; + } + + inline static + int vector_length_max() + { return Impl::CudaTraits::WarpSize; } + + //---------------------------------------- + + inline int vector_length() const { return m_vector_length ; } + inline int team_size() const { return m_team_size ; } + inline int league_size() const { return m_league_size ; } + + /** \brief Specify league size, request team size */ + TeamPolicy( execution_space & , int league_size_ , int team_size_request , int 
vector_length_request = 1 ) + : m_league_size( league_size_ ) + , m_team_size( team_size_request ) + , m_vector_length ( vector_length_request ) + { + // Allow only power-of-two vector_length + int check = 0; + for(int k = 1; k <= vector_length_max(); k*=2) + if(k == vector_length_request) + check = 1; + if(!check) + Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy."); + + // Make sure league size is permissable + if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) + Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space."); + } + + TeamPolicy( int league_size_ , int team_size_request , int vector_length_request = 1 ) + : m_league_size( league_size_ ) + , m_team_size( team_size_request ) + , m_vector_length ( vector_length_request ) + { + // Allow only power-of-two vector_length + int check = 0; + for(int k = 1; k <= vector_length_max(); k*=2) + if(k == vector_length_request) + check = 1; + if(!check) + Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy."); + + // Make sure league size is permissable + if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) + Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space."); + + } + + typedef Kokkos::Impl::CudaTeamMember member_type ; +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > Policy ; + + const FunctorType m_functor ; + const Policy m_policy ; + + ParallelFor(); + ParallelFor & operator = ( const 
ParallelFor & ); + + template< class Tag > + inline static + __device__ + void driver( const FunctorType & functor + , typename Impl::enable_if< Impl::is_same< Tag , void >::value + , typename Policy::member_type const & >::type iwork + ) + { functor( iwork ); } + + template< class Tag > + inline static + __device__ + void driver( const FunctorType & functor + , typename Impl::enable_if< ! Impl::is_same< Tag , void >::value + , typename Policy::member_type const & >::type iwork + ) + { functor( Tag() , iwork ); } + +public: + + typedef FunctorType functor_type ; + + inline + __device__ + void operator()(void) const + { + const typename Policy::member_type work_stride = blockDim.y * gridDim.x ; + const typename Policy::member_type work_end = m_policy.end(); + + for ( typename Policy::member_type + iwork = m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x ; + iwork < work_end ; + iwork += work_stride ) { + ParallelFor::template driver< typename Policy::work_tag >( m_functor, iwork ); + } + } + + ParallelFor( const FunctorType & functor , + const Policy & policy ) + : m_functor( functor ) + , m_policy( policy ) + { + const dim3 block( 1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1); + const dim3 grid( std::min( ( int( policy.end() - policy.begin() ) + block.y - 1 ) / block.y + , cuda_internal_maximum_grid_count() ) + , 1 , 1); + + CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 ); + } +}; + +template< class FunctorType , class Arg0 , class Arg1 > +class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda > > +{ +private: + + typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda > Policy ; + +public: + + typedef FunctorType functor_type ; + typedef Cuda::size_type size_type ; + +private: + + // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1 + // shared memory utilization: + // + // [ team reduce space ] + // [ team shared space ] + // + + const FunctorType m_functor ; + 
size_type m_shmem_begin ; + size_type m_shmem_size ; + size_type m_league_size ; + + template< class TagType > + __device__ inline + void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member ) const + { m_functor( member ); } + + template< class TagType > + __device__ inline + void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member ) const + { m_functor( TagType() , member ); } + +public: + + __device__ inline + void operator()(void) const + { + // Iterate this block through the league + for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) { + + ParallelFor::template driver< typename Policy::work_tag >( + typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>() + , m_shmem_begin + , m_shmem_size + , league_rank + , m_league_size ) ); + } + } + + + ParallelFor( const FunctorType & functor + , const Policy & policy + ) + : m_functor( functor ) + , m_shmem_begin( sizeof(double) * ( policy.team_size() + 2 ) ) + , m_shmem_size( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) ) + , m_league_size( policy.league_size() ) + { + // Functor's reduce memory, team scan memory, and team shared memory depend upon team size. 
+ + const int shmem_size_total = m_shmem_begin + m_shmem_size ; + + if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) { + Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); + } + + const dim3 grid( int(policy.league_size()) , 1 , 1 ); + const dim3 block( policy.vector_length() , policy.team_size() , 1 ); + + CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > > +{ +private: + + typedef Kokkos::RangePolicy<Arg0,Arg1,Arg2, Kokkos::Cuda > Policy ; + typedef typename Policy::WorkRange work_range ; + typedef typename Policy::work_tag work_tag ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ; + +public: + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::value_type value_type ; + typedef typename ValueTraits::reference_type reference_type ; + typedef FunctorType functor_type ; + typedef Cuda::size_type size_type ; + + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1 + + const FunctorType m_functor ; + const Policy m_policy ; + size_type * m_scratch_space ; + size_type * m_scratch_flags ; + size_type * m_unified_space ; + + // Determine block size constrained by shared memory: + static inline + unsigned local_block_size( const FunctorType & f ) + { + unsigned n = CudaTraits::WarpSize * 8 ; + while ( n && CudaTraits::SharedMemoryCapacity < 
cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( f , n ) ) { n >>= 1 ; } + return n ; + } + + template< class Tag > + inline static + __device__ + void driver( const FunctorType & functor + , typename Impl::enable_if< Impl::is_same< Tag , void >::value + , typename Policy::member_type const & >::type iwork + , reference_type value ) + { functor( iwork , value ); } + + template< class Tag > + inline static + __device__ + void driver( const FunctorType & functor + , typename Impl::enable_if< ! Impl::is_same< Tag , void >::value + , typename Policy::member_type const & >::type iwork + , reference_type value ) + { functor( Tag() , iwork , value ); } + +#ifndef KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION + __device__ inline + void operator()(void) const + { + const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) > + word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) ); + + { + reference_type value = + ValueInit::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value ); + + // Number of blocks is bounded so that the reduction can be limited to two passes. + // Each thread block is given an approximately equal amount of work to perform. + // Accumulate the values for this block. + // The accumulation ordering does not match the final pass, but is arithmatically equivalent. + + const work_range range( m_policy , blockIdx.x , gridDim.x ); + + for ( typename work_range::member_type iwork = range.begin() + threadIdx.y , iwork_end = range.end() ; + iwork < iwork_end ; iwork += blockDim.y ) { + ParallelReduce::template driver< work_tag >( m_functor , iwork , value ); + } + } + + // Reduce with final value at blockDim.y - 1 location. 
+ if ( cuda_single_inter_block_reduce_scan<false,FunctorType,work_tag>( + m_functor , blockIdx.x , gridDim.x , + kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) { + + // This is the final block with the final result at the final threads' location + + size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ; + size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ; + + if ( threadIdx.y == 0 ) { + Kokkos::Impl::FunctorFinal< FunctorType , work_tag >::final( m_functor , shared ); + } + + if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } + + for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; } + } + } +#else + __device__ inline + void operator()(void) const + { + + value_type value = 0; + + // Number of blocks is bounded so that the reduction can be limited to two passes. + // Each thread block is given an approximately equal amount of work to perform. + // Accumulate the values for this block. + // The accumulation ordering does not match the final pass, but is arithmatically equivalent. + + const Policy range( m_policy , blockIdx.x , gridDim.x ); + + for ( typename Policy::member_type iwork = range.begin() + threadIdx.y , iwork_end = range.end() ; + iwork < iwork_end ; iwork += blockDim.y ) { + ParallelReduce::template driver< work_tag >( m_functor , iwork , value ); + } + + pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ; + int max_active_thread = range.end()-range.begin() < blockDim.y ? 
range.end() - range.begin():blockDim.y; + max_active_thread = max_active_thread == 0?blockDim.y:max_active_thread; + if(Impl::cuda_inter_block_reduction<FunctorType,Impl::JoinAdd<value_type> > + (value,Impl::JoinAdd<value_type>(),m_scratch_space,result,m_scratch_flags,max_active_thread)) { + const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; + if(id==0) { + Kokkos::Impl::FunctorFinal< FunctorType , work_tag >::final( m_functor , (void*) &value ); + *result = value; + } + } + } +#endif + template< class HostViewType > + ParallelReduce( const FunctorType & functor + , const Policy & policy + , const HostViewType & result + ) + : m_functor( functor ) + , m_policy( policy ) + , m_scratch_space( 0 ) + , m_scratch_flags( 0 ) + , m_unified_space( 0 ) + { + const int block_size = local_block_size( functor ); + const int block_count = std::min( int(block_size) + , ( int(policy.end() - policy.begin()) + block_size - 1 ) / block_size + ); + + m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( functor ) * block_count ); + m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); + m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( functor ) ); + + const dim3 grid( block_count , 1 , 1 ); + const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 ) +#ifdef KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION + const int shmem = 0; +#else + const int shmem = cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( m_functor , block.y ); +#endif + + CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute + + Cuda::fence(); + + if ( result.ptr_on_device() ) { + if ( m_unified_space ) { + const int count = ValueTraits::value_count( m_functor ); + for ( int i = 0 ; i < count ; ++i ) { result.ptr_on_device()[i] = pointer_type(m_unified_space)[i] ; } + } + else { + const int size = ValueTraits::value_size( m_functor ); + DeepCopy<HostSpace,CudaSpace>( 
result.ptr_on_device() , m_scratch_space , size ); + } + } + } +}; + +template< class FunctorType , class Arg0 , class Arg1 > +class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda > > +{ +private: + + typedef Kokkos::TeamPolicy<Arg0,Arg1,Kokkos::Cuda> Policy ; + typedef typename Policy::work_tag work_tag ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ; + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + +public: + + typedef FunctorType functor_type ; + typedef Cuda::size_type size_type ; + +private: + + // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1 + // shared memory utilization: + // + // [ global reduce space ] + // [ team reduce space ] + // [ team shared space ] + // + + const FunctorType m_functor ; + size_type * m_scratch_space ; + size_type * m_scratch_flags ; + size_type * m_unified_space ; + size_type m_team_begin ; + size_type m_shmem_begin ; + size_type m_shmem_size ; + size_type m_league_size ; + + template< class TagType > + __device__ inline + void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member + , reference_type update ) const + { m_functor( member , update ); } + + template< class TagType > + __device__ inline + void driver( typename Impl::enable_if< ! 
Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member + , reference_type update ) const + { m_functor( TagType() , member , update ); } + +public: + + __device__ inline + void operator()(void) const + { + const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) > + word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) ); + + reference_type value = + ValueInit::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value ); + + // Iterate this block through the league + for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) { + + ParallelReduce::template driver< work_tag > + ( typename Policy::member_type( kokkos_impl_cuda_shared_memory<char>() + m_team_begin + , m_shmem_begin + , m_shmem_size + , league_rank + , m_league_size ) + , value ); + } + + // Reduce with final value at blockDim.y - 1 location. + if ( cuda_single_inter_block_reduce_scan<false,FunctorType,work_tag>( + m_functor , blockIdx.x , gridDim.x , + kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) { + + // This is the final block with the final result at the final threads' location + + size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ; + size_type * const global = m_unified_space ? 
m_unified_space : m_scratch_space ; + + if ( threadIdx.y == 0 ) { + Kokkos::Impl::FunctorFinal< FunctorType , work_tag >::final( m_functor , shared ); + } + + if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } + + for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; } + } + } + + + template< class HostViewType > + ParallelReduce( const FunctorType & functor + , const Policy & policy + , const HostViewType & result + ) + : m_functor( functor ) + , m_scratch_space( 0 ) + , m_scratch_flags( 0 ) + , m_unified_space( 0 ) + , m_team_begin( cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( functor , policy.team_size() ) ) + , m_shmem_begin( sizeof(double) * ( policy.team_size() + 2 ) ) + , m_shmem_size( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) ) + , m_league_size( policy.league_size() ) + { + + // The global parallel_reduce does not support vector_length other than 1 at the moment + if(policy.vector_length() > 1) + Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA."); + + // Functor's reduce memory, team scan memory, and team shared memory depend upon team size. 
+ + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ; + const int not_power_of_two = 0 != ( policy.team_size() & ( policy.team_size() - 1 ) ); + + if ( not_power_of_two || CudaTraits::SharedMemoryCapacity < shmem_size_total ) { + Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); + } + + const int block_count = std::min( policy.league_size() , policy.team_size() ); + + m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( functor ) * block_count ); + m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); + m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( functor ) ); + + const dim3 grid( block_count , 1 , 1 ); + const dim3 block( 1 , policy.team_size() , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 ) + + CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute + + Cuda::fence(); + + if ( result.ptr_on_device() ) { + if ( m_unified_space ) { + const int count = ValueTraits::value_count( m_functor ); + for ( int i = 0 ; i < count ; ++i ) { result.ptr_on_device()[i] = pointer_type(m_unified_space)[i] ; } + } + else { + const int size = ValueTraits::value_size( m_functor ); + DeepCopy<HostSpace,CudaSpace>( result.ptr_on_device() , m_scratch_space , size ); + } + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > > +{ +private: + + typedef Kokkos::RangePolicy<Arg0,Arg1,Arg2, Kokkos::Cuda > Policy ; + typedef typename Policy::WorkRange work_range ; + typedef typename Policy::work_tag work_tag ; + typedef 
Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ; + typedef Kokkos::Impl::FunctorValueOps< FunctorType , work_tag > ValueOps ; + +public: + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + typedef FunctorType functor_type ; + typedef Cuda::size_type size_type ; + + // Algorithmic constraints: + // (a) blockDim.y is a power of two + // (b) blockDim.y == blockDim.z == 1 + // (c) gridDim.x <= blockDim.y * blockDim.y + // (d) gridDim.y == gridDim.z == 1 + + // Determine block size constrained by shared memory: + static inline + unsigned local_block_size( const FunctorType & f ) + { + // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512 (16 warps) + // gridDim.x <= blockDim.y * blockDim.y + // + // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing + + unsigned n = CudaTraits::WarpSize * 4 ; + while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( f , n ) ) { n >>= 1 ; } + return n ; + } + + const FunctorType m_functor ; + const Policy m_policy ; + size_type * m_scratch_space ; + size_type * m_scratch_flags ; + size_type m_final ; + + template< class Tag > + inline static + __device__ + void driver( const FunctorType & functor + , typename Impl::enable_if< Impl::is_same< Tag , void >::value + , typename Policy::member_type const & >::type iwork + , reference_type value + , const bool final ) + { functor( iwork , value , final ); } + + template< class Tag > + inline static + __device__ + void driver( const FunctorType & functor + , typename Impl::enable_if< ! 
Impl::is_same< Tag , void >::value + , typename Policy::member_type const & >::type iwork + , reference_type value + , const bool final ) + { functor( Tag() , iwork , value , final ); } + + //---------------------------------------- + + __device__ inline + void initial(void) const + { + const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) > + word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) ); + + size_type * const shared_value = kokkos_impl_cuda_shared_memory<size_type>() + word_count.value * threadIdx.y ; + + ValueInit::init( m_functor , shared_value ); + + // Number of blocks is bounded so that the reduction can be limited to two passes. + // Each thread block is given an approximately equal amount of work to perform. + // Accumulate the values for this block. + // The accumulation ordering does not match the final pass, but is arithmatically equivalent. + + const work_range range( m_policy , blockIdx.x , gridDim.x ); + + for ( typename Policy::member_type iwork = range.begin() + threadIdx.y , iwork_end = range.end() ; + iwork < iwork_end ; iwork += blockDim.y ) { + ParallelScan::template driver< work_tag > + ( m_functor , iwork , ValueOps::reference( shared_value ) , false ); + } + + // Reduce and scan, writing out scan of blocks' totals and block-groups' totals. + // Blocks' scan values are written to 'blockIdx.x' location. 
+ // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < gridDim.x + cuda_single_inter_block_reduce_scan<true,FunctorType,work_tag>( m_functor , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ); + } + + //---------------------------------------- + + __device__ inline + void final(void) const + { + const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) > + word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) ); + + // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... } + size_type * const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); + size_type * const shared_prefix = shared_data + word_count.value * threadIdx.y ; + size_type * const shared_accum = shared_data + word_count.value * ( blockDim.y + 1 ); + + // Starting value for this thread block is the previous block's total. + if ( blockIdx.x ) { + size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 ); + for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; } + } + else if ( 0 == threadIdx.y ) { + ValueInit::init( m_functor , shared_accum ); + } + + const work_range range( m_policy , blockIdx.x , gridDim.x ); + + for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) { + + const typename Policy::member_type iwork = iwork_base + threadIdx.y ; + + __syncthreads(); // Don't overwrite previous iteration values until they are used + + ValueInit::init( m_functor , shared_prefix + word_count.value ); + + // Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block + for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { + shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ; + } + + if ( CudaTraits::WarpSize < word_count.value ) { 
__syncthreads(); } // Protect against large scan values. + + // Call functor to accumulate inclusive scan value for this work item + if ( iwork < range.end() ) { + ParallelScan::template driver< work_tag > + ( m_functor , iwork , ValueOps::reference( shared_prefix + word_count.value ) , false ); + } + + // Scan block values into locations shared_data[1..blockDim.y] + cuda_intra_block_reduce_scan<true,FunctorType,work_tag>( m_functor , ValueTraits::pointer_type(shared_data+word_count.value) ); + + { + size_type * const block_total = shared_data + word_count.value * blockDim.y ; + for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; } + } + + // Call functor with exclusive scan value + if ( iwork < range.end() ) { + ParallelScan::template driver< work_tag > + ( m_functor , iwork , ValueOps::reference( shared_prefix ) , true ); + } + } + } + + //---------------------------------------- + + __device__ inline + void operator()(void) const + { + if ( ! m_final ) { + initial(); + } + else { + final(); + } + } + + ParallelScan( const FunctorType & functor , + const Policy & policy ) + : m_functor( functor ) + , m_policy( policy ) + , m_scratch_space( 0 ) + , m_scratch_flags( 0 ) + , m_final( false ) + { + enum { GridMaxComputeCapability_2x = 0x0ffff }; + + const int block_size = local_block_size( functor ); + + const int grid_max = ( block_size * block_size ) < GridMaxComputeCapability_2x ? 
+ ( block_size * block_size ) : GridMaxComputeCapability_2x ; + + // At most 'max_grid' blocks: + const int nwork = policy.end() - policy.begin(); + const int max_grid = std::min( int(grid_max) , int(( nwork + block_size - 1 ) / block_size )); + + // How much work per block: + const int work_per_block = ( nwork + max_grid - 1 ) / max_grid ; + + // How many block are really needed for this much work: + const dim3 grid( ( nwork + work_per_block - 1 ) / work_per_block , 1 , 1 ); + const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 ) + const int shmem = ValueTraits::value_size( functor ) * ( block_size + 2 ); + + m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( functor ) * grid.x ); + m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 ); + + m_final = false ; + CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute + + m_final = true ; + CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute + } + + void wait() const { Cuda::fence(); } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + template<typename iType> + struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> { + typedef iType index_type; + const iType start; + const iType end; + const iType increment; + const CudaTeamMember& thread; + +#ifdef __CUDA_ARCH__ + __device__ inline + TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count): + start( threadIdx.y ), + end( count ), + increment( blockDim.y ), + thread(thread_) + {} + __device__ inline + TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_): + start( begin_+threadIdx.y ), + end( end_ ), + increment( blockDim.y ), + thread(thread_) 
+ {} +#else + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count): + start( 0 ), + end( count ), + increment( 1 ), + thread(thread_) + {} + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_): + start( begin_ ), + end( end_ ), + increment( 1 ), + thread(thread_) + {} +#endif + }; + + template<typename iType> + struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> { + typedef iType index_type; + const iType start; + const iType end; + const iType increment; + +#ifdef __CUDA_ARCH__ + __device__ inline + ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread, const iType& count): + start( threadIdx.x ), + end( count ), + increment( blockDim.x ) + {} +#else + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count): + start( 0 ), + end( count ), + increment( 1 ) + {} +#endif + }; + +} // namespace Impl + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> + TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& count) { + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>(thread,count); +} + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> + TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& begin, const iType& end) { + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>(thread,begin,end); +} + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember > + ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) { + return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count); +} + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct<Impl::CudaTeamMember> PerTeam(const 
Impl::CudaTeamMember& thread) { + return Impl::ThreadSingleStruct<Impl::CudaTeamMember>(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct<Impl::CudaTeamMember> PerThread(const Impl::CudaTeamMember& thread) { + return Impl::VectorSingleStruct<Impl::CudaTeamMember>(thread); +} + +} // namespace Kokkos + +namespace Kokkos { + + /** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team. + * This functionality requires C++11 support.*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries, const Lambda& lambda) { + #ifdef __CUDA_ARCH__ + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i); + #endif +} + +/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of + * val is performed and put into result. This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries, + const Lambda & lambda, ValueType& result) { + +#ifdef __CUDA_ARCH__ + result = ValueType(); + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,result); + } + + Impl::cuda_intra_warp_reduction(result,[&] (ValueType& dst, const ValueType& src) { dst+=src; }); + Impl::cuda_inter_warp_reduction(result,[&] (ValueType& dst, const ValueType& src) { dst+=src; }); + +#endif +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. 
+ * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of + * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result. + * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore + * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or + * '1 for *'). This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries, + const Lambda & lambda, const JoinType& join, ValueType& init_result) { + +#ifdef __CUDA_ARCH__ + ValueType result = init_result; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,result); + } + + Impl::cuda_intra_warp_reduction(result, join ); + Impl::cuda_inter_warp_reduction(result, join ); + + init_result = result; +#endif +} + +} //namespace Kokkos + +namespace Kokkos { +/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread. + * This functionality requires C++11 support.*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >& + loop_boundaries, const Lambda& lambda) { +#ifdef __CUDA_ARCH__ + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i); +#endif +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of + * val is performed and put into result. 
This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >& + loop_boundaries, const Lambda & lambda, ValueType& result) { +#ifdef __CUDA_ARCH__ + ValueType val = ValueType(); + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,val); + } + + result = val; + + if (loop_boundaries.increment > 1) + result += shfl_down(result, 1,loop_boundaries.increment); + if (loop_boundaries.increment > 2) + result += shfl_down(result, 2,loop_boundaries.increment); + if (loop_boundaries.increment > 4) + result += shfl_down(result, 4,loop_boundaries.increment); + if (loop_boundaries.increment > 8) + result += shfl_down(result, 8,loop_boundaries.increment); + if (loop_boundaries.increment > 16) + result += shfl_down(result, 16,loop_boundaries.increment); + + result = shfl(result,0,loop_boundaries.increment); +#endif +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of + * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result. + * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore + * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or + * '1 for *'). 
This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >& + loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) { + +#ifdef __CUDA_ARCH__ + ValueType result = init_result; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,result); + } + + if (loop_boundaries.increment > 1) + join( result, shfl_down(result, 1,loop_boundaries.increment)); + if (loop_boundaries.increment > 2) + join( result, shfl_down(result, 2,loop_boundaries.increment)); + if (loop_boundaries.increment > 4) + join( result, shfl_down(result, 4,loop_boundaries.increment)); + if (loop_boundaries.increment > 8) + join( result, shfl_down(result, 8,loop_boundaries.increment)); + if (loop_boundaries.increment > 16) + join( result, shfl_down(result, 16,loop_boundaries.increment)); + + init_result = shfl(result,0,loop_boundaries.increment); +#endif +} + +/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final) + * for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed. + * Depending on the target execution space the operator might be called twice: once with final=false + * and once with final=true. When final==true val contains the prefix sum value. The contribution of this + * "i" needs to be added to val no matter whether final==true or not. In a serial execution + * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set + * to the final sum value over all vector lanes. 
+ * This functionality requires C++11 support.*/ +template< typename iType, class FunctorType > +KOKKOS_INLINE_FUNCTION +void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >& + loop_boundaries, const FunctorType & lambda) { + +#ifdef __CUDA_ARCH__ + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef typename ValueTraits::value_type value_type ; + + value_type scan_val = value_type(); + const int VectorLength = blockDim.x; + + iType loop_bound = ((loop_boundaries.end+VectorLength-1)/VectorLength) * VectorLength; + for(int _i = threadIdx.x; _i < loop_bound; _i += VectorLength) { + value_type val = value_type(); + if(_i<loop_boundaries.end) + lambda(_i , val , false); + + value_type tmp = val; + value_type result_i; + + if(threadIdx.x%VectorLength == 0) + result_i = tmp; + if (VectorLength > 1) { + const value_type tmp2 = shfl_up(tmp, 1,VectorLength); + if(threadIdx.x > 0) + tmp+=tmp2; + } + if(threadIdx.x%VectorLength == 1) + result_i = tmp; + if (VectorLength > 3) { + const value_type tmp2 = shfl_up(tmp, 2,VectorLength); + if(threadIdx.x > 1) + tmp+=tmp2; + } + if ((threadIdx.x%VectorLength >= 2) && + (threadIdx.x%VectorLength < 4)) + result_i = tmp; + if (VectorLength > 7) { + const value_type tmp2 = shfl_up(tmp, 4,VectorLength); + if(threadIdx.x > 3) + tmp+=tmp2; + } + if ((threadIdx.x%VectorLength >= 4) && + (threadIdx.x%VectorLength < 8)) + result_i = tmp; + if (VectorLength > 15) { + const value_type tmp2 = shfl_up(tmp, 8,VectorLength); + if(threadIdx.x > 7) + tmp+=tmp2; + } + if ((threadIdx.x%VectorLength >= 8) && + (threadIdx.x%VectorLength < 16)) + result_i = tmp; + if (VectorLength > 31) { + const value_type tmp2 = shfl_up(tmp, 16,VectorLength); + if(threadIdx.x > 15) + tmp+=tmp2; + } + if (threadIdx.x%VectorLength >= 16) + result_i = tmp; + + val = scan_val + result_i - val; + scan_val += shfl(tmp,VectorLength-1,VectorLength); + if(_i<loop_boundaries.end) + lambda(_i , val , 
true); + } +#endif +} + +} + +namespace Kokkos { + +template<class FunctorType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) { +#ifdef __CUDA_ARCH__ + if(threadIdx.x == 0) lambda(); +#endif +} + +template<class FunctorType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) { +#ifdef __CUDA_ARCH__ + if(threadIdx.x == 0 && threadIdx.y == 0) lambda(); +#endif +} + +template<class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda, ValueType& val) { +#ifdef __CUDA_ARCH__ + if(threadIdx.x == 0) lambda(val); + val = shfl(val,0,blockDim.x); +#endif +} + +template<class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) { +#ifdef __CUDA_ARCH__ + if(threadIdx.x == 0 && threadIdx.y == 0) { + lambda(val); + } + single_struct.team_member.team_broadcast(val,0); +#endif +} + +} + +namespace Kokkos { + +namespace Impl { + template< class FunctorType, class ExecPolicy, class ValueType , class Tag = typename ExecPolicy::work_tag> + struct CudaFunctorAdapter { + const FunctorType f; + typedef ValueType value_type; + CudaFunctorAdapter(const FunctorType& f_):f(f_) {} + + __device__ inline + void operator() (typename ExecPolicy::work_tag, const typename ExecPolicy::member_type& i, ValueType& val) const { + //Insert Static Assert with decltype on ValueType equals third argument type of FunctorType::operator() + f(typename ExecPolicy::work_tag(), i,val); + } + }; + + template< class FunctorType, class ExecPolicy, class ValueType > + struct CudaFunctorAdapter<FunctorType,ExecPolicy,ValueType,void> { + const FunctorType f; + typedef ValueType value_type; + CudaFunctorAdapter(const FunctorType& f_):f(f_) {} + + __device__ 
inline + void operator() (const typename ExecPolicy::member_type& i, ValueType& val) const { + //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator() + f(i,val); + } + + }; + + template< class FunctorType, class Enable = void> + struct ReduceFunctorHasInit { + enum {value = false}; + }; + + template< class FunctorType> + struct ReduceFunctorHasInit<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type > { + enum {value = true}; + }; + + template< class FunctorType, class Enable = void> + struct ReduceFunctorHasJoin { + enum {value = false}; + }; + + template< class FunctorType> + struct ReduceFunctorHasJoin<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type > { + enum {value = true}; + }; + + template< class FunctorType, class Enable = void> + struct ReduceFunctorHasFinal { + enum {value = false}; + }; + + template< class FunctorType> + struct ReduceFunctorHasFinal<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type > { + enum {value = true}; + }; + + template< class FunctorType, bool Enable = + ( FunctorDeclaresValueType<FunctorType,void>::value) || + ( ReduceFunctorHasInit<FunctorType>::value ) || + ( ReduceFunctorHasJoin<FunctorType>::value ) || + ( ReduceFunctorHasFinal<FunctorType>::value ) + > + struct IsNonTrivialReduceFunctor { + enum {value = false}; + }; + + template< class FunctorType> + struct IsNonTrivialReduceFunctor<FunctorType, true> { + enum {value = true}; + }; + + template<class FunctorType, class ResultType, class Tag, bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value > + struct FunctorReferenceType { + typedef ResultType& reference_type; + }; + + template<class FunctorType, class ResultType, class Tag> + struct FunctorReferenceType<FunctorType, ResultType, Tag, true> { + typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType ,Tag >::reference_type reference_type; + }; + +} + +// general 
policy and view output
+template< class ExecPolicy , class FunctorTypeIn , class ViewType >
+inline
+void parallel_reduce( const ExecPolicy & policy
+                    , const FunctorTypeIn & functor_in
+                    , const ViewType & result_view
+                    , const std::string& str = ""
+                    , typename Impl::enable_if<
+                      ( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value &&
+                        Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
+                      )>::type * = 0 )
+{
+  enum {FunctorHasValueType = Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
+  typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,typename ViewType::value_type> >::type FunctorType;
+  FunctorType functor = Impl::if_c<FunctorHasValueType,FunctorTypeIn,FunctorType>::select(functor_in,FunctorType(functor_in));
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+  if(Kokkos::Experimental::profileLibraryLoaded()) {
+    Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+  }
+#endif
+
+  (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( functor , policy , result_view );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  if(Kokkos::Experimental::profileLibraryLoaded()) {
+    Kokkos::Experimental::endParallelScan(kpID);
+  }
+#endif
+}
+
+// general policy and pod or array of pod output
+template< class ExecPolicy , class FunctorTypeIn , class ResultType>
+inline
+void parallel_reduce( const ExecPolicy & policy
+                    , const FunctorTypeIn & functor_in
+                    , ResultType& result_ref
+                    , const std::string& str = ""
+                    , typename Impl::enable_if<
+                      ( ! Impl::is_view<ResultType>::value &&
+                        ! Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value &&
+                        !
Impl::is_integral< ExecPolicy >::value && + Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )>::type * = 0 ) +{ + typedef typename Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ResultType> FunctorType; + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ; + + // Wrap the result output request in a view to inform the implementation + // of the type and memory space. + + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + Kokkos::View< value_type + , HostSpace + , Kokkos::MemoryUnmanaged + > + result_view( ValueOps::pointer( result_ref ) + , 1 + ); + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( FunctorType(functor_in) , policy , result_view ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelScan(kpID); + } +#endif +} + +// general policy and pod or array of pod output +template< class ExecPolicy , class FunctorType> +inline +void parallel_reduce( const ExecPolicy & policy + , const FunctorType & functor + , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type result_ref + , const std::string& str = "" + , typename Impl::enable_if< + ( Impl::IsNonTrivialReduceFunctor<FunctorType>::value && + ! 
Impl::is_integral< ExecPolicy >::value &&
+                        Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )>::type * = 0 )
+{
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ;
+
+  // Wrap the result output request in a view to inform the implementation
+  // of the type and memory space.
+
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  Kokkos::View< value_type
+              , HostSpace
+              , Kokkos::MemoryUnmanaged
+              >
+    result_view( ValueOps::pointer( result_ref )
+               , ValueTraits::value_count( functor )
+               );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+  if(Kokkos::Experimental::profileLibraryLoaded()) {
+    Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+  }
+#endif
+
+  (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( functor , policy , result_view );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  if(Kokkos::Experimental::profileLibraryLoaded()) {
+    Kokkos::Experimental::endParallelScan(kpID);
+  }
+#endif
+}
+
+// integral range policy and view output
+template< class FunctorTypeIn , class ViewType >
+inline
+void parallel_reduce( const size_t work_count
+                    , const FunctorTypeIn & functor_in
+                    , const ViewType & result_view
+                    , const std::string& str = ""
+                    , typename Impl::enable_if<( Impl::is_view<ViewType>::value &&
+                                                 Impl::is_same<
+                                                   typename Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space,
+                                                   Kokkos::Cuda>::value
+                                               )>::type * = 0 )
+{
+  enum {FunctorHasValueType = Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space
+      execution_space ;
+
+  typedef RangePolicy< execution_space > ExecPolicy ;
+
+  typedef
typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,typename ViewType::value_type> >::type FunctorType; + + FunctorType functor = Impl::if_c<FunctorHasValueType,FunctorTypeIn,FunctorType>::select(functor_in,FunctorType(functor_in)); + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( functor , ExecPolicy(0,work_count) , result_view ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelScan(kpID); + } +#endif + +} + +// integral range policy and pod or array of pod output +template< class FunctorTypeIn , class ResultType> +inline +void parallel_reduce( const size_t work_count + , const FunctorTypeIn & functor_in + , ResultType& result + , const std::string& str = "" + , typename Impl::enable_if< ! Impl::is_view<ResultType>::value && + ! Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value && + Impl::is_same< + typename Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space, + Kokkos::Cuda>::value >::type * = 0 ) +{ + typedef typename + Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space + execution_space ; + typedef Kokkos::RangePolicy< execution_space > ExecPolicy ; + + typedef Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ResultType> FunctorType; + + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ; + + + // Wrap the result output request in a view to inform the implementation + // of the type and memory space. 
+ + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + + Kokkos::View< value_type + , HostSpace + , Kokkos::MemoryUnmanaged + > + result_view( ValueOps::pointer( result ) + , 1 + ); + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelReduce< FunctorType , ExecPolicy >( FunctorType(functor_in) , ExecPolicy(0,work_count) , result_view ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelScan(kpID); + } +#endif +} + +template< class FunctorType> +inline +void parallel_reduce( const size_t work_count + , const FunctorType & functor + , typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::reference_type result + , const std::string& str = "" + , typename Impl::enable_if< Impl::IsNonTrivialReduceFunctor<FunctorType>::value && + Impl::is_same< + typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space, + Kokkos::Cuda>::value >::type * = 0 ) +{ + + typedef typename + Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space + execution_space ; + typedef Kokkos::RangePolicy< execution_space > ExecPolicy ; + + + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ; + + + // Wrap the result output request in a view to inform the implementation + // of the type and memory space. 
+ + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + + Kokkos::View< value_type + , HostSpace + , Kokkos::MemoryUnmanaged + > + result_view( ValueOps::pointer( result ) + , ValueTraits::value_count( functor ) + ); + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelReduce< FunctorType , ExecPolicy >( functor , ExecPolicy(0,work_count) , result_view ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelScan(kpID); + } +#endif +} + +} // namespace Kokkos +#endif /* defined( __CUDACC__ ) */ + +#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */ + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp new file mode 100755 index 0000000000000000000000000000000000000000..5ef16711eecb006103f32e65d84bd3d310be2719 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -0,0 +1,424 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_REDUCESCAN_HPP +#define KOKKOS_CUDA_REDUCESCAN_HPP + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA ) + +#include <utility> + +#include <Kokkos_Parallel.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> +#include <impl/Kokkos_Error.hpp> +#include <Cuda/Kokkos_Cuda_Vectorization.hpp> +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + + + +//Shfl based reductions +/* + * Algorithmic constraints: + * (a) threads with same threadIdx.y have same value + * (b) blockDim.x == power of two + * (c) blockDim.z == 1 + */ + +template< class ValueType , class JoinOp> +__device__ +inline void cuda_intra_warp_reduction( ValueType& result, + const JoinOp& join, + const int max_active_thread = blockDim.y) { + + unsigned int shift = 1; + + //Reduce over values from threads with different threadIdx.y + while(blockDim.x * shift < 32 ) { + const ValueType tmp = shfl_down(result, blockDim.x*shift,32u); + //Only join if upper thread is active (this allows non power of two for blockDim.y + if(threadIdx.y + shift < max_active_thread) + join(result , tmp); + shift*=2; + } + + result = shfl(result,0,32); +} + +template< class ValueType , class JoinOp> +__device__ +inline void cuda_inter_warp_reduction( ValueType& value, + const JoinOp& join, + const int max_active_thread = blockDim.y) { + + #define STEP_WIDTH 4 + __shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH]; + ValueType* result = (ValueType*) & sh_result; + const unsigned step = 32 / blockDim.x; + unsigned shift = STEP_WIDTH; + const int id = threadIdx.y%step==0?threadIdx.y/step:65000; + if(id < STEP_WIDTH ) { + result[id] = value; + 
} + __syncthreads(); + while (shift<=max_active_thread/step) { + if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) { + join(result[id%STEP_WIDTH],value); + } + __syncthreads(); + shift+=STEP_WIDTH; + } + + + value = result[0]; + for(int i = 1; (i*step<=max_active_thread) && i<STEP_WIDTH; i++) + join(value,result[i]); +} + +template< class ValueType , class JoinOp> +__device__ +inline void cuda_intra_block_reduction( ValueType& value, + const JoinOp& join, + const int max_active_thread = blockDim.y) { + cuda_intra_warp_reduction(value,join,max_active_thread); + cuda_inter_warp_reduction(value,join,max_active_thread); +} + +template< class FunctorType , class JoinOp> +__device__ +bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type value, + const JoinOp& join, + Cuda::size_type * const m_scratch_space, + typename FunctorValueTraits< FunctorType , void >::pointer_type const result, + Cuda::size_type * const m_scratch_flags, + const int max_active_thread = blockDim.y) { + typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type; + typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type; + + //Do the intra-block reduction with shfl operations and static shared memory + cuda_intra_block_reduction(value,join,max_active_thread); + + const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; + + //One thread in the block writes block result to global scratch_memory + if(id == 0 ) { + pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x; + *global = value; + } + + //One warp of last block performs inter block reduction through loading the block values from global scratch_memory + bool last_block = false; + + __syncthreads(); + if ( id < 32 ) { + Cuda::size_type count; + + //Figure out whether this is the last block + if(id == 0) + count = Kokkos::atomic_fetch_add(m_scratch_flags,1); + count = Kokkos::shfl(count,0,32); + + //Last block does the inter block 
reduction + if( count == gridDim.x - 1) { + //set flag back to zero + if(id == 0) + *m_scratch_flags = 0; + last_block = true; + value = 0; + + pointer_type const volatile global = (pointer_type) m_scratch_space ; + + //Reduce all global values with splitting work over threads in one warp + const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32; + for(int i=id; i<gridDim.x; i+=step_size) { + value_type tmp = global[i]; + join(value, tmp); + } + + //Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32) + if (blockDim.x*blockDim.y > 1) { + value_type tmp = Kokkos::shfl_down(value, 1,32); + if( id + 1 < gridDim.x ) + join(value, tmp); + } + if (blockDim.x*blockDim.y > 2) { + value_type tmp = Kokkos::shfl_down(value, 2,32); + if( id + 2 < gridDim.x ) + join(value, tmp); + } + if (blockDim.x*blockDim.y > 4) { + value_type tmp = Kokkos::shfl_down(value, 4,32); + if( id + 4 < gridDim.x ) + join(value, tmp); + } + if (blockDim.x*blockDim.y > 8) { + value_type tmp = Kokkos::shfl_down(value, 8,32); + if( id + 8 < gridDim.x ) + join(value, tmp); + } + if (blockDim.x*blockDim.y > 16) { + value_type tmp = Kokkos::shfl_down(value, 16,32); + if( id + 16 < gridDim.x ) + join(value, tmp); + } + } + } + + //The last block has in its thread=0 the global reduction value through "value" + return last_block; +} + +//---------------------------------------------------------------------------- +// See section B.17 of Cuda C Programming Guide Version 3.2 +// for discussion of +// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) +// function qualifier which could be used to improve performance. 
+//---------------------------------------------------------------------------- +// Maximize shared memory and minimize L1 cache: +// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared ); +// For 2.0 capability: 48 KB shared and 16 KB L1 +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +/* + * Algorithmic constraints: + * (a) blockDim.y is a power of two + * (b) blockDim.y <= 512 + * (c) blockDim.x == blockDim.z == 1 + */ + +template< bool DoScan , class FunctorType , class ArgTag > +__device__ +void cuda_intra_block_reduce_scan( const FunctorType & functor , + const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data ) +{ + typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ; + typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ; + + typedef typename ValueTraits::pointer_type pointer_type ; + + const unsigned value_count = ValueTraits::value_count( functor ); + const unsigned BlockSizeMask = blockDim.y - 1 ; + + // Must have power of two thread count + + if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim"); } + +#define BLOCK_REDUCE_STEP( R , TD , S ) \ + if ( ! 
( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); } + +#define BLOCK_SCAN_STEP( TD , N , S ) \ + if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); } + + const unsigned rtid_intra = threadIdx.y ^ BlockSizeMask ; + const pointer_type tdata_intra = base_data + value_count * threadIdx.y ; + + { // Intra-warp reduction: + BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0) + BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1) + BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2) + BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3) + BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4) + } + + __syncthreads(); // Wait for all warps to reduce + + { // Inter-warp reduce-scan by a single warp to avoid extra synchronizations + const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ; + + if ( rtid_inter < blockDim.y ) { + + const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask ); + + if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) } + if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) } + if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) } + if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) } + + if ( DoScan ) { + + int n = ( rtid_inter & 32 ) ? 32 : ( + ( rtid_inter & 64 ) ? 64 : ( + ( rtid_inter & 128 ) ? 128 : ( + ( rtid_inter & 256 ) ? 256 : 0 ))); + + if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ; + + BLOCK_SCAN_STEP(tdata_inter,n,8) + BLOCK_SCAN_STEP(tdata_inter,n,7) + BLOCK_SCAN_STEP(tdata_inter,n,6) + BLOCK_SCAN_STEP(tdata_inter,n,5) + } + } + } + + __syncthreads(); // Wait for inter-warp reduce-scan to complete + + if ( DoScan ) { + int n = ( rtid_intra & 1 ) ? 1 : ( + ( rtid_intra & 2 ) ? 2 : ( + ( rtid_intra & 4 ) ? 4 : ( + ( rtid_intra & 8 ) ? 8 : ( + ( rtid_intra & 16 ) ? 16 : 0 )))); + + if ( ! 
( rtid_intra + n < blockDim.y ) ) n = 0 ; + + BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,0) + } + +#undef BLOCK_SCAN_STEP +#undef BLOCK_REDUCE_STEP +} + +//---------------------------------------------------------------------------- +/**\brief Input value-per-thread starting at 'shared_data'. + * Reduction value at last thread's location. + * + * If 'DoScan' then write blocks' scan values and block-groups' scan values. + * + * Global reduce result is in the last threads' 'shared_data' location. + */ +template< bool DoScan , class FunctorType , class ArgTag > +__device__ +bool cuda_single_inter_block_reduce_scan( const FunctorType & functor , + const Cuda::size_type block_id , + const Cuda::size_type block_count , + Cuda::size_type * const shared_data , + Cuda::size_type * const global_data , + Cuda::size_type * const global_flags ) +{ + typedef Cuda::size_type size_type ; + typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ; + typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ; + typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ; + typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + const unsigned BlockSizeMask = blockDim.y - 1 ; + const unsigned BlockSizeShift = power_of_two_if_valid( blockDim.y ); + + // Must have power of two thread count + if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); } + + const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) > + word_count( ValueTraits::value_size( functor ) / sizeof(size_type) ); + + // Reduce the accumulation for the entire 
block. + cuda_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) ); + + { + // Write accumulation total to global scratch space. + // Accumulation total is the last thread's data. + size_type * const shared = shared_data + word_count.value * BlockSizeMask ; + size_type * const global = global_data + word_count.value * block_id ; + + for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; } + } + + // Contributing blocks note that their contribution has been completed via an atomic-increment flag + // If this block is not the last block to contribute to this group then the block is done. + const bool is_last_block = + ! __syncthreads_or( threadIdx.y ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) ); + + if ( is_last_block ) { + + const size_type b = ( long(block_count) * long(threadIdx.y) ) >> BlockSizeShift ; + const size_type e = ( long(block_count) * long( threadIdx.y + 1 ) ) >> BlockSizeShift ; + + { + void * const shared_ptr = shared_data + word_count.value * threadIdx.y ; + reference_type shared_value = ValueInit::init( functor , shared_ptr ); + + for ( size_type i = b ; i < e ; ++i ) { + ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i ); + } + } + + cuda_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) ); + + if ( DoScan ) { + + size_type * const shared_value = shared_data + word_count.value * ( threadIdx.y ? threadIdx.y - 1 : blockDim.y ); + + if ( ! 
threadIdx.y ) { ValueInit::init( functor , shared_value ); } + + // Join previous inclusive scan value to each member + for ( size_type i = b ; i < e ; ++i ) { + size_type * const global_value = global_data + word_count.value * i ; + ValueJoin::join( functor , shared_value , global_value ); + ValueOps ::copy( functor , global_value , shared_value ); + } + } + } + + return is_last_block ; +} + +// Size in bytes required for inter block reduce or scan +template< bool DoScan , class FunctorType , class ArgTag > +inline +unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize ) +{ + return ( BlockSize + 2 ) * Impl::FunctorValueTraits< FunctorType , ArgTag >::value_size( functor ); +} + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( __CUDACC__ ) */ +#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */ + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp new file mode 100755 index 0000000000000000000000000000000000000000..0b8427cbe1e9664a41b6bb8b33b21320ad613d78 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -0,0 +1,298 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_CUDA_VECTORIZATION_HPP +#define KOKKOS_CUDA_VECTORIZATION_HPP + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#ifdef KOKKOS_HAVE_CUDA + +#include <Kokkos_Cuda.hpp> + +namespace Kokkos { + + +// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs +// or other GPUs. We provide a generic definition (which is trivial +// and doesn't do what it claims to do) because we don't actually use +// this function unless we are on a suitable GPU, with a suitable +// Scalar type. 
(For example, in the mat-vec, the "ThreadsPerRow" +// internal parameter depends both on the ExecutionSpace and the Scalar type, +// and it controls whether shfl_down() gets called.) +namespace Impl { + + template< typename Scalar > + struct shfl_union { + enum {n = sizeof(Scalar)/4}; + float fval[n]; + KOKKOS_INLINE_FUNCTION + Scalar value() { + return *(Scalar*) fval; + } + KOKKOS_INLINE_FUNCTION + void operator= (Scalar& value_) { + float* const val_ptr = (float*) &value_; + for(int i=0; i<n ; i++) { + fval[i] = val_ptr[i]; + } + } + KOKKOS_INLINE_FUNCTION + void operator= (const Scalar& value_) { + float* const val_ptr = (float*) &value_; + for(int i=0; i<n ; i++) { + fval[i] = val_ptr[i]; + } + } + + }; +} + +#ifdef __CUDA_ARCH__ + #if (__CUDA_ARCH__ >= 300) + + KOKKOS_INLINE_FUNCTION + int shfl(const int &val, const int& srcLane, const int& width ) { + return __shfl(val,srcLane,width); + } + + KOKKOS_INLINE_FUNCTION + float shfl(const float &val, const int& srcLane, const int& width ) { + return __shfl(val,srcLane,width); + } + + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width + ) { + Scalar tmp1 = val; + float tmp = *reinterpret_cast<float*>(&tmp1); + tmp = __shfl(tmp,srcLane,width); + return *reinterpret_cast<Scalar*>(&tmp); + } + + KOKKOS_INLINE_FUNCTION + double shfl(const double &val, const int& srcLane, const int& width) { + int lo = __double2loint(val); + int hi = __double2hiint(val); + lo = __shfl(lo,srcLane,width); + hi = __shfl(hi,srcLane,width); + return __hiloint2double(hi,lo); + } + + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) { + int lo = __double2loint(*reinterpret_cast<const double*>(&val)); + int hi = __double2hiint(*reinterpret_cast<const double*>(&val)); + lo = 
__shfl(lo,srcLane,width); + hi = __shfl(hi,srcLane,width); + const double tmp = __hiloint2double(hi,lo); + return *(reinterpret_cast<const Scalar*>(&tmp)); + } + + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) { + Impl::shfl_union<Scalar> s_val; + Impl::shfl_union<Scalar> r_val; + s_val = val; + + for(int i = 0; i<s_val.n; i++) + r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width); + return r_val.value(); + } + + KOKKOS_INLINE_FUNCTION + int shfl_down(const int &val, const int& delta, const int& width) { + return __shfl_down(val,delta,width); + } + + KOKKOS_INLINE_FUNCTION + float shfl_down(const float &val, const int& delta, const int& width) { + return __shfl_down(val,delta,width); + } + + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) { + Scalar tmp1 = val; + float tmp = *reinterpret_cast<float*>(&tmp1); + tmp = __shfl_down(tmp,delta,width); + return *reinterpret_cast<Scalar*>(&tmp); + } + + KOKKOS_INLINE_FUNCTION + double shfl_down(const double &val, const int& delta, const int& width) { + int lo = __double2loint(val); + int hi = __double2hiint(val); + lo = __shfl_down(lo,delta,width); + hi = __shfl_down(hi,delta,width); + return __hiloint2double(hi,lo); + } + + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) { + int lo = __double2loint(*reinterpret_cast<const double*>(&val)); + int hi = __double2hiint(*reinterpret_cast<const double*>(&val)); + lo = __shfl_down(lo,delta,width); + hi = __shfl_down(hi,delta,width); + const double tmp = __hiloint2double(hi,lo); + return *(reinterpret_cast<const Scalar*>(&tmp)); + } + + template<typename Scalar> + 
KOKKOS_INLINE_FUNCTION + Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) { + Impl::shfl_union<Scalar> s_val; + Impl::shfl_union<Scalar> r_val; + s_val = val; + + for(int i = 0; i<s_val.n; i++) + r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width); + return r_val.value(); + } + + KOKKOS_INLINE_FUNCTION + int shfl_up(const int &val, const int& delta, const int& width ) { + return __shfl_up(val,delta,width); + } + + KOKKOS_INLINE_FUNCTION + float shfl_up(const float &val, const int& delta, const int& width ) { + return __shfl_up(val,delta,width); + } + + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) { + Scalar tmp1 = val; + float tmp = *reinterpret_cast<float*>(&tmp1); + tmp = __shfl_up(tmp,delta,width); + return *reinterpret_cast<Scalar*>(&tmp); + } + + KOKKOS_INLINE_FUNCTION + double shfl_up(const double &val, const int& delta, const int& width ) { + int lo = __double2loint(val); + int hi = __double2hiint(val); + lo = __shfl_up(lo,delta,width); + hi = __shfl_up(hi,delta,width); + return __hiloint2double(hi,lo); + } + + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) { + int lo = __double2loint(*reinterpret_cast<const double*>(&val)); + int hi = __double2hiint(*reinterpret_cast<const double*>(&val)); + lo = __shfl_up(lo,delta,width); + hi = __shfl_up(hi,delta,width); + const double tmp = __hiloint2double(hi,lo); + return *(reinterpret_cast<const Scalar*>(&tmp)); + } + + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) { + Impl::shfl_union<Scalar> s_val; + Impl::shfl_union<Scalar> r_val; + 
s_val = val; + + for(int i = 0; i<s_val.n; i++) + r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width); + return r_val.value(); + } + + #else + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl(const Scalar &val, const int& srcLane, const int& width) { + if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0."); + return val; + } + + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl_down(const Scalar &val, const int& delta, const int& width) { + if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0."); + return val; + } + + template<typename Scalar> + KOKKOS_INLINE_FUNCTION + Scalar shfl_up(const Scalar &val, const int& delta, const int& width) { + if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0."); + return val; + } + #endif +#else + template<typename Scalar> + inline + Scalar shfl(const Scalar &val, const int& srcLane, const int& width) { + if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0."); + return val; + } + + template<typename Scalar> + inline + Scalar shfl_down(const Scalar &val, const int& delta, const int& width) { + if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0."); + return val; + } + + template<typename Scalar> + inline + Scalar shfl_up(const Scalar &val, const int& delta, const int& width) { + if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0."); + return val; + } +#endif + + + +} + +#endif // KOKKOS_HAVE_CUDA +#endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp new file mode 100755 index 0000000000000000000000000000000000000000..a78ead0cbace7b5a8a76d80ae905c7311bcecb26 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -0,0 +1,312 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_VIEW_HPP +#define KOKKOS_CUDA_VIEW_HPP + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#ifdef KOKKOS_HAVE_CUDA + +#include <cstring> + +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_CudaSpace.hpp> +#include <Kokkos_View.hpp> + +#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template<> +struct AssertShapeBoundsAbort< CudaSpace > +{ + KOKKOS_INLINE_FUNCTION + static void apply( const size_t /* rank */ , + const size_t /* n0 */ , const size_t /* n1 */ , + const size_t /* n2 */ , const size_t /* n3 */ , + const size_t /* n4 */ , const size_t /* n5 */ , + const size_t /* n6 */ , const size_t /* n7 */ , + + const size_t /* arg_rank */ , + const size_t /* i0 */ , const size_t /* i1 */ , + const size_t /* i2 */ , const size_t /* i3 */ , + const size_t /* i4 */ , const size_t /* i5 */ , + const size_t /* i6 */ , const size_t /* i7 */ ) + { + Kokkos::abort("Kokkos::View array bounds violation"); + } +}; + +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4) +// Via reinterpret_case this can be used to support all scalar types of those sizes. 
+// Any other scalar type falls back to either normal reads out of global memory, +// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0) + +template< typename ValueType + , class MemorySpace + , class AliasType = + typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 4 ) , int , + typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 8 ) , ::int2 , + typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 , void + >::type + >::type + >::type + > +class CudaTextureFetch { +private: + + cuda_texture_object_type m_obj ; + const ValueType * m_alloc_ptr ; + int m_offset ; + + void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker ) + { + typedef char const * const byte; + + m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr()); + + size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr); + const bool ok_aligned = 0 == byte_offset % sizeof(ValueType); + + const size_t count = tracker.alloc_size() / sizeof(ValueType); + const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count)); + + if (ok_aligned && ok_contains) { + if (tracker.attribute() == NULL ) { + MemorySpace::texture_object_attach( + tracker + , sizeof(ValueType) + , cudaCreateChannelDesc< AliasType >() + ); + } + m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj; + m_offset = arg_ptr - m_alloc_ptr; + } + else if( !ok_contains ) { + throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer."); + } + else { + throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer."); + } + } + +public: + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {} + + KOKKOS_INLINE_FUNCTION + ~CudaTextureFetch() {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch( const CudaTextureFetch & rhs ) + : m_obj( rhs.m_obj ) + , m_alloc_ptr( rhs.m_alloc_ptr ) + , 
m_offset( rhs.m_offset ) + {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) + { + m_obj = rhs.m_obj ; + m_alloc_ptr = rhs.m_alloc_ptr ; + m_offset = rhs.m_offset ; + return *this ; + } + + KOKKOS_INLINE_FUNCTION explicit + CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker ) + : m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0) + { + #if defined( KOKKOS_USE_LDG_INTRINSIC ) + m_alloc_ptr(arg_ptr); + #elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ ) + if ( arg_ptr != NULL ) { + if ( tracker.is_valid() ) { + attach( arg_ptr, tracker ); + } + else { + AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr); + if ( found_tracker.is_valid() ) { + attach( arg_ptr, found_tracker ); + } else { + throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!"); + } + } + } + #endif + } + + KOKKOS_INLINE_FUNCTION + operator const ValueType * () const { return m_alloc_ptr + m_offset ; } + + + template< typename iType > + KOKKOS_INLINE_FUNCTION + ValueType operator[]( const iType & i ) const + { + #if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) + AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i])); + return *(reinterpret_cast<ValueType*> (&v)); + #elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) + AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset ); + return *(reinterpret_cast<ValueType*> (&v)); + #else + return m_alloc_ptr[ i + m_offset ]; + #endif + } +}; + +template< typename ValueType, class MemorySpace > +class CudaTextureFetch< const ValueType, MemorySpace, void > +{ +private: + const ValueType * m_ptr ; +public: + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch() : m_ptr(0) {}; + + KOKKOS_INLINE_FUNCTION + ~CudaTextureFetch() { + } + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {} + + 
KOKKOS_INLINE_FUNCTION + CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) { + m_ptr = rhs.m_ptr; + return *this ; + } + + explicit KOKKOS_INLINE_FUNCTION + CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) { + m_ptr = base_view_ptr; + } + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch & operator = (const ValueType* base_view_ptr) { + m_ptr = base_view_ptr; + return *this; + } + + + KOKKOS_INLINE_FUNCTION + operator const ValueType * () const { return m_ptr ; } + + + template< typename iType > + KOKKOS_INLINE_FUNCTION + ValueType operator[]( const iType & i ) const + { + return m_ptr[ i ]; + } +}; + +} // namespace Impl +} // namespace Kokkos + + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization + * if 'const' value type, CudaSpace and random access. 
+ */ +template< class ViewTraits > +class ViewDataHandle< ViewTraits , + typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value || + is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value ) + && + is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value + && + ViewTraits::memory_traits::RandomAccess + >::type > +{ +public: + enum { ReturnTypeIsReference = false }; + + typedef Impl::CudaTextureFetch< typename ViewTraits::value_type + , typename ViewTraits::memory_space> handle_type; + + KOKKOS_INLINE_FUNCTION + static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker ) + { + return handle_type(arg_data_ptr, arg_tracker); + } + + typedef typename ViewTraits::value_type return_type; +}; + +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif // KOKKOS_HAVE_CUDA +#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */ + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp new file mode 100755 index 0000000000000000000000000000000000000000..deb955ccd4755d43a24469171f2689d8c2a87dae --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp @@ -0,0 +1,119 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_ABORT_HPP +#define KOKKOS_CUDA_ABORT_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +#include "Kokkos_Macros.hpp" +#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA ) + +#include <cuda.h> + +#if ! 
defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 ) +#error "Cuda version 4.1 or greater required" +#endif + +#if ( __CUDA_ARCH__ < 200 ) +#error "Cuda device capability 2.0 or greater required" +#endif + +extern "C" { +/* Cuda runtime function, declared in <crt/device_runtime.h> + * Requires capability 2.x or better. + */ +extern __device__ void __assertfail( + const void *message, + const void *file, + unsigned int line, + const void *function, + size_t charsize); +} + +namespace Kokkos { +namespace Impl { + +__device__ inline +void cuda_abort( const char * const message ) +{ +#ifndef __APPLE__ + const char empty[] = "" ; + + __assertfail( (const void *) message , + (const void *) empty , + (unsigned int) 0 , + (const void *) empty , + sizeof(char) ); +#endif +} + +} // namespace Impl +} // namespace Kokkos + +#else + +namespace Kokkos { +namespace Impl { +KOKKOS_INLINE_FUNCTION +void cuda_abort( const char * const ) {} +} +} + +#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) +namespace Kokkos { +__device__ inline +void abort( const char * const message ) { Kokkos::Impl::cuda_abort(message); } +} +#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */ + diff --git a/lib/kokkos/core/src/KokkosExp_View.hpp b/lib/kokkos/core/src/KokkosExp_View.hpp new file mode 100755 index 0000000000000000000000000000000000000000..a2226f3de0562cacc88311ac001bf4c9b5d710fc --- /dev/null +++ b/lib/kokkos/core/src/KokkosExp_View.hpp @@ -0,0 +1,1945 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXPERIMENTAL_VIEW_HPP +#define KOKKOS_EXPERIMENTAL_VIEW_HPP + +#include <string> +#include <type_traits> +#include <initializer_list> + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template< class > struct ViewDataAnalysis ; + +template< class , class = void , typename Enable = void > +class ViewMapping { enum { is_assignable = false }; }; + +template< class DstMemorySpace , class SrcMemorySpace > +struct DeepCopy ; + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +/** \class ViewTraits + * \brief Traits class for accessing attributes of a View. + * + * This is an implementation detail of View. It is only of interest + * to developers implementing a new specialization of View. 
+ * + * Template argument permutations: + * - View< DataType , void , void , void > + * - View< DataType , Space , void , void > + * - View< DataType , Space , MemoryTraits , void > + * - View< DataType , Space , void , MemoryTraits > + * - View< DataType , ArrayLayout , void , void > + * - View< DataType , ArrayLayout , Space , void > + * - View< DataType , ArrayLayout , MemoryTraits , void > + * - View< DataType , ArrayLayout , Space , MemoryTraits > + * - View< DataType , MemoryTraits , void , void > + */ + +template< class DataType , + class Arg1 = void , + class Arg2 = void , + class Arg3 = void > +class ViewTraits { +private: + + // Layout, Space, and MemoryTraits are optional + // but need to appear in that order. That means Layout + // can only be Arg1, Space can be Arg1 or Arg2, and + // MemoryTraits can be Arg1, Arg2 or Arg3 + + enum { Arg1IsLayout = Kokkos::Impl::is_array_layout<Arg1>::value }; + + enum { Arg1IsSpace = Kokkos::Impl::is_space<Arg1>::value }; + enum { Arg2IsSpace = Kokkos::Impl::is_space<Arg2>::value }; + + enum { Arg1IsMemoryTraits = Kokkos::Impl::is_memory_traits<Arg1>::value }; + enum { Arg2IsMemoryTraits = Kokkos::Impl::is_memory_traits<Arg2>::value }; + enum { Arg3IsMemoryTraits = Kokkos::Impl::is_memory_traits<Arg3>::value }; + + enum { Arg1IsVoid = std::is_same< Arg1 , void >::value }; + enum { Arg2IsVoid = std::is_same< Arg2 , void >::value }; + enum { Arg3IsVoid = std::is_same< Arg3 , void >::value }; + + static_assert( 1 == Arg1IsLayout + Arg1IsSpace + Arg1IsMemoryTraits + Arg1IsVoid + , "Template argument #1 must be layout, space, traits, or void" ); + + // If Arg1 is Layout then Arg2 is Space, MemoryTraits, or void + // If Arg1 is Space then Arg2 is MemoryTraits or void + // If Arg1 is MemoryTraits then Arg2 is void + // If Arg1 is Void then Arg2 is void + + static_assert( ( Arg1IsLayout && ( 1 == Arg2IsSpace + Arg2IsMemoryTraits + Arg2IsVoid ) ) || + ( Arg1IsSpace && ( 0 == Arg2IsSpace ) && ( 1 == Arg2IsMemoryTraits + 
Arg2IsVoid ) ) || + ( Arg1IsMemoryTraits && Arg2IsVoid ) || + ( Arg1IsVoid && Arg2IsVoid ) + , "Template argument #2 must be space, traits, or void" ); + + // Arg3 is MemoryTraits or void and at most one argument is MemoryTraits + static_assert( ( 1 == Arg3IsMemoryTraits + Arg3IsVoid ) && + ( Arg1IsMemoryTraits + Arg2IsMemoryTraits + Arg3IsMemoryTraits <= 1 ) + , "Template argument #3 must be traits or void" ); + + typedef + typename std::conditional< Arg1IsSpace , Arg1 , + typename std::conditional< Arg2IsSpace , Arg2 , Kokkos::DefaultExecutionSpace + >::type >::type::execution_space + ExecutionSpace ; + + typedef + typename std::conditional< Arg1IsSpace , Arg1 , + typename std::conditional< Arg2IsSpace , Arg2 , Kokkos::DefaultExecutionSpace + >::type >::type::memory_space + MemorySpace ; + + typedef + typename Kokkos::Impl::is_space< + typename std::conditional< Arg1IsSpace , Arg1 , + typename std::conditional< Arg2IsSpace , Arg2 , Kokkos::DefaultExecutionSpace + >::type >::type >::host_mirror_space + HostMirrorSpace ; + + typedef + typename std::conditional< Arg1IsLayout , Arg1 , typename ExecutionSpace::array_layout >::type + ArrayLayout ; + + // Arg1, Arg2, or Arg3 may be memory traits + typedef + typename std::conditional< Arg1IsMemoryTraits , Arg1 , + typename std::conditional< Arg2IsMemoryTraits , Arg2 , + typename std::conditional< Arg3IsMemoryTraits , Arg3 , MemoryManaged + >::type >::type >::type + MemoryTraits ; + + typedef Kokkos::Experimental::Impl::ViewDataAnalysis< DataType > analysis ; + +public: + + //------------------------------------ + // Data type traits: + + typedef typename analysis::type data_type ; + typedef typename analysis::const_type const_data_type ; + typedef typename analysis::non_const_type non_const_data_type ; + + //------------------------------------ + // Compatible array of trivial type traits: + + typedef typename analysis::array_scalar_type array_scalar_type ; + typedef typename analysis::const_array_scalar_type 
const_array_scalar_type ; + typedef typename analysis::non_const_array_scalar_type non_const_array_scalar_type ; + + //------------------------------------ + // Value type traits: + + typedef typename analysis::value_type value_type ; + typedef typename analysis::const_value_type const_value_type ; + typedef typename analysis::non_const_value_type non_const_value_type ; + + //------------------------------------ + // Mapping traits: + + typedef ArrayLayout array_layout ; + typedef typename analysis::dimension dimension ; + typedef typename analysis::specialize specialize /* mapping specialization tag */ ; + + enum { rank = dimension::rank }; + enum { rank_dynamic = dimension::rank_dynamic }; + + //------------------------------------ + // Execution space, memory space, memory access traits, and host mirror space. + + typedef ExecutionSpace execution_space ; + typedef MemorySpace memory_space ; + typedef Device<ExecutionSpace,MemorySpace> device_type ; + typedef MemoryTraits memory_traits ; + typedef HostMirrorSpace host_mirror_space ; + + typedef typename memory_space::size_type size_type ; + + enum { is_hostspace = std::is_same< memory_space , HostSpace >::value }; + enum { is_managed = memory_traits::Unmanaged == 0 }; + enum { is_random_access = memory_traits::RandomAccess == 1 }; + + //------------------------------------ +}; + +/** \class View + * \brief View to an array of data. + * + * A View represents an array of one or more dimensions. + * For details, please refer to Kokkos' tutorial materials. + * + * \section Kokkos_View_TemplateParameters Template parameters + * + * This class has both required and optional template parameters. The + * \c DataType parameter must always be provided, and must always be + * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are + * placeholders for different template parameters. The default value + * of the fifth template parameter \c Specialize suffices for most use + * cases. 
When explaining the template parameters, we won't refer to + * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer + * to the valid categories of template parameters, in whatever order + * they may occur. + * + * Valid ways in which template arguments may be specified: + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , Space , void , MemoryTraits > + * - View< DataType , Layout , Space > + * - View< DataType , Layout , Space , MemoryTraits > + * + * \tparam DataType (required) This indicates both the type of each + * entry of the array, and the combination of compile-time and + * run-time array dimension(s). For example, <tt>double*</tt> + * indicates a one-dimensional array of \c double with run-time + * dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int + * with run-time first dimension and compile-time second dimension + * (of 3). In general, the run-time dimensions (if any) must go + * first, followed by zero or more compile-time dimensions. For + * more examples, please refer to the tutorial materials. + * + * \tparam Space (required) The memory space. + * + * \tparam Layout (optional) The array's layout in memory. For + * example, LayoutLeft indicates a column-major (Fortran style) + * layout, and LayoutRight a row-major (C style) layout. If not + * specified, this defaults to the preferred layout for the + * <tt>Space</tt>. + * + * \tparam MemoryTraits (optional) Assertion of the user's intended + * access behavior. For example, RandomAccess indicates read-only + * access with limited spatial locality, and Unmanaged lets users + * wrap externally allocated memory in a View without automatic + * deallocation. + * + * \section Kokkos_View_MT MemoryTraits discussion + * + * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on Space + * + * Some \c MemoryTraits options may have different interpretations for + * different \c Space types. 
For example, with the Cuda device, + * \c RandomAccess tells Kokkos to fetch the data through the texture + * cache, whereas the non-GPU devices have no such hardware construct. + * + * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits + * + * Users should defer applying the optional \c MemoryTraits parameter + * until the point at which they actually plan to rely on it in a + * computational kernel. This minimizes the number of template + * parameters exposed in their code, which reduces the cost of + * compilation. Users may always assign a View without specified + * \c MemoryTraits to a compatible View with that specification. + * For example: + * \code + * // Pass in the simplest types of View possible. + * void + * doSomething (View<double*, Cuda> out, + * View<const double*, Cuda> in) + * { + * // Assign the "generic" View in to a RandomAccess View in_rr. + * // Note that RandomAccess View objects must have const data. + * View<const double*, Cuda, RandomAccess> in_rr = in; + * // ... do something with in_rr and out ... 
+ * }
+ * \endcode
+ */
+template< class DataType
+        , class Arg1 = void /* ArrayLayout, SpaceType, or MemoryTraits */
+        , class Arg2 = void /* SpaceType or MemoryTraits */
+        , class Arg3 = void /* MemoryTraits */ >
+class View ;
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include <impl/KokkosExp_ViewMapping.hpp>
+#include <impl/KokkosExp_ViewAllocProp.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace {
+
+constexpr Kokkos::Experimental::Impl::ALL_t
+  ALL = Kokkos::Experimental::Impl::ALL_t();
+
+constexpr Kokkos::Experimental::Impl::WithoutInitializing_t
+  WithoutInitializing = Kokkos::Experimental::Impl::WithoutInitializing_t();
+
+constexpr Kokkos::Experimental::Impl::AllowPadding_t
+  AllowPadding = Kokkos::Experimental::Impl::AllowPadding_t();
+
+}
+
+/** \brief Create View allocation parameter bundle from argument list.
+ *
+ * Valid argument list members are:
+ * 1) label as a "string" or std::string
+ * 2) memory space instance of the View::memory_space type
+ * 3) execution space instance compatible with the View::memory_space
+ * 4) Kokkos::WithoutInitializing to bypass initialization
+ * 5) Kokkos::AllowPadding to allow allocation to pad dimensions for memory alignment
+ */
+template< class ... Args >
+inline
+Kokkos::Experimental::Impl::ViewAllocProp< Args ... >
+view_alloc( Args ... args )
+{
+  return Kokkos::Experimental::Impl::ViewAllocProp< Args ... >( args ...
); +} + +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +/**\brief Each R? template argument designates whether the subview argument is a range */ +template< class V + , bool R0 = false , bool R1 = false , bool R2 = false , bool R3 = false + , bool R4 = false , bool R5 = false , bool R6 = false , bool R7 = false > +using Subview = typename Kokkos::Experimental::Impl::SubviewType< V, R0 , R1 , R2 , R3 , R4 , R5 , R6 , R7 >::type ; + +template< class DataType , class Arg1 , class Arg2 , class Arg3 > +class View : public ViewTraits< DataType , Arg1 , Arg2 , Arg3 > { +private: + + template< class , class , class , class > friend class View ; + + typedef ViewTraits< DataType , Arg1 , Arg2 , Arg3 > traits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits > map_type ; + typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ; + + track_type m_track ; + map_type m_map ; + +public: + + //---------------------------------------- + /** \brief Compatible view of array of scalar types */ + typedef View< typename traits::array_scalar_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > + array_type ; + + /** \brief Compatible view of const data type */ + typedef View< typename traits::const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > + const_type ; + + /** \brief Compatible view of non-const data type */ + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > + non_const_type ; + + /** \brief Compatible HostMirror view */ + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + 
typename traits::host_mirror_space , + void > + HostMirror ; + + //---------------------------------------- + // Domain dimensions + + enum { Rank = map_type::Rank }; + + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); } + + //---------------------------------------- + // Range span + + typedef typename map_type::reference_type reference_type ; + + enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value }; + + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() 
const { return m_map.span_is_contiguous(); }
+  KOKKOS_INLINE_FUNCTION constexpr typename traits::value_type * data() const { return m_map.data(); }
+
+  // Deprecated, use 'span_is_contiguous()' instead
+  KOKKOS_INLINE_FUNCTION constexpr bool is_contiguous() const { return m_map.span_is_contiguous(); }
+  // Deprecated, use 'data()' instead
+  KOKKOS_INLINE_FUNCTION constexpr typename traits::value_type * ptr_on_device() const { return m_map.data(); }
+
+  //----------------------------------------
+
+private:
+
+  typedef typename
+    std::conditional< Rank == 0 , reference_type
+      , Kokkos::Experimental::Impl::Error_view_scalar_reference_to_non_scalar_view >::type
+    scalar_operator_reference_type ;
+
+  typedef typename
+    std::conditional< Rank == 0 , const int
+      , Kokkos::Experimental::Impl::Error_view_scalar_reference_to_non_scalar_view >::type
+    scalar_operator_index_type ;
+
+public:
+
+  // Rank == 0
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  scalar_operator_reference_type operator()() const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, 0, 0, 0, 0, 0, 0, 0, 0 );
+      return scalar_operator_reference_type( m_map.reference() );
+    }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  operator()( scalar_operator_index_type i0
+            , const int i1 = 0 , const int i2 = 0 , const int i3 = 0
+            , const int i4 = 0 , const int i5 = 0 , const int i6 = 0 , const int i7 = 0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 );
+      return m_map.reference();
+    }
+
+  // Rank == 1
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Rank == 1 && std::is_integral<I0>::value
+                          ), reference_type >::type
+  operator[]( const I0 & i0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, 0, 0, 0, 0, 0, 0, 0 );
+      return m_map.reference(i0);
+    }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(
Rank == 1 && std::is_integral<I0>::value + ), reference_type >::type + operator()( const I0 & i0 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, 0, 0, 0, 0, 0, 0, 0 ); + return m_map.reference(i0); + } + + template< typename I0 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type + operator()( const I0 & i0 + , typename std::enable_if<( Rank == 1 && std::is_integral<I0>::value ), const int >::type i1 + , const int i2 = 0 , const int i3 = 0 + , const int i4 = 0 , const int i5 = 0 , const int i6 = 0 , const int i7 = 0 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 ); + return m_map.reference(i0); + } + + // Rank == 2 + + template< typename I0 , typename I1 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( Rank == 2 && + std::is_integral<I0>::value && + std::is_integral<I1>::value + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, 0, 0, 0, 0, 0, 0 ); + return m_map.reference(i0,i1); + } + + template< typename I0 , typename I1 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type + operator()( const I0 & i0 , const I1 & i1 + , typename std::enable_if<( Rank == 2 && + std::is_integral<I0>::value && + std::is_integral<I1>::value + ), const int >::type i2 + , const int i3 = 0 + , const int i4 = 0 , const int i5 = 0 , const int i6 = 0 , const int i7 = 0 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 ); + return m_map.reference(i0,i1); + } + + // Rank == 3 + + template< typename I0 , typename I1 , typename I2 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( Rank == 3 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const 
I2 & i2 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, 0, 0, 0, 0, 0 ); + return m_map.reference(i0,i1,i2); + } + + template< typename I0 , typename I1 , typename I2 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 + , typename std::enable_if<( Rank == 3 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value + ), const int >::type i3 + , const int i4 = 0 , const int i5 = 0 , const int i6 = 0 , const int i7 = 0 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 ); + return m_map.reference(i0,i1,i2); + } + + // Rank == 4 + + template< typename I0 , typename I1 , typename I2 , typename I3 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( Rank == 4 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value && + std::is_integral<I3>::value + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, 0, 0, 0, 0 ); + return m_map.reference(i0,i1,i2,i3); + } + + template< typename I0 , typename I1 , typename I2 , typename I3 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , typename std::enable_if<( Rank == 4 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value && + std::is_integral<I3>::value + ), const int >::type i4 + , const int i5 = 0 , const int i6 = 0 , const int i7 = 0 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 ); + return m_map.reference(i0,i1,i2,i3); + } + + // Rank == 5 + + template< typename I0 , typename I1 , typename I2 , 
typename I3 + , typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( Rank == 5 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value && + std::is_integral<I3>::value && + std::is_integral<I4>::value + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, 0, 0, 0 ); + return m_map.reference(i0,i1,i2,i3,i4); + } + + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 + , typename std::enable_if<( Rank == 5 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value && + std::is_integral<I3>::value && + std::is_integral<I4>::value + ), const int >::type i5 + , const int i6 = 0 , const int i7 = 0 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 ); + return m_map.reference(i0,i1,i2,i3,i4); + } + + // Rank == 6 + + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( Rank == 6 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value && + std::is_integral<I3>::value && + std::is_integral<I4>::value && + std::is_integral<I5>::value + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, 0, 0 ); + return m_map.reference(i0,i1,i2,i3,i4,i5); + } + + template< typename I0 , typename I1 , typename I2 , typename I3 + , 
typename I4 , typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 + , typename std::enable_if<( Rank == 6 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value && + std::is_integral<I3>::value && + std::is_integral<I4>::value + ), const int >::type i6 + , const int i7 = 0 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 ); + return m_map.reference(i0,i1,i2,i3,i4,i5); + } + + // Rank == 7 + + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 , typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( Rank == 7 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value && + std::is_integral<I3>::value && + std::is_integral<I4>::value && + std::is_integral<I5>::value && + std::is_integral<I6>::value + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 , const I6 & i6 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, 0 ); + return m_map.reference(i0,i1,i2,i3,i4,i5,i6); + } + + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 , typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 , const I6 & i6 + , typename std::enable_if<( Rank == 7 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value && + std::is_integral<I3>::value && + std::is_integral<I4>::value + ), const int >::type i7 + ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, 
i3, i4, i5, i6, i7 ); + return m_map.reference(i0,i1,i2,i3,i4,i5,i6); + } + + // Rank == 8 + + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 , typename I6 , typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<( Rank == 8 && + std::is_integral<I0>::value && + std::is_integral<I1>::value && + std::is_integral<I2>::value && + std::is_integral<I3>::value && + std::is_integral<I4>::value && + std::is_integral<I5>::value && + std::is_integral<I6>::value && + std::is_integral<I7>::value + ), reference_type >::type + operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 ) const + { + KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 ); + return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7); + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + ~View() {} + + KOKKOS_INLINE_FUNCTION + View() : m_track(), m_map() {} + + KOKKOS_INLINE_FUNCTION + View( const View & rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {} + + KOKKOS_INLINE_FUNCTION + View( View && rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {} + + KOKKOS_INLINE_FUNCTION + View & operator = ( const View & rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; } + + KOKKOS_INLINE_FUNCTION + View & operator = ( View && rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; } + + //---------------------------------------- + + template< class RT , class R1 , class R2 , class R3 > + KOKKOS_INLINE_FUNCTION + View( const View<RT,R1,R2,R3> & rhs ) + : m_track( rhs.m_track ) + , m_map() + { + typedef typename View<RT,R1,R2,R3>::traits SrcTraits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits > Mapping ; + static_assert( Mapping::is_assignable , "Incompatible View copy construction" ); + Mapping::assign( m_map , rhs.m_map , rhs.m_track ); + } 
+ + template< class RT , class R1 , class R2 , class R3 > + KOKKOS_INLINE_FUNCTION + View( View<RT,R1,R2,R3> && rhs ) + : m_track( rhs.m_track ) + , m_map() + { + typedef typename View<RT,R1,R2,R3>::traits SrcTraits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits > Mapping ; + static_assert( Mapping::is_assignable , "Incompatible View move construction" ); + Mapping::assign( m_map , rhs.m_map , rhs.m_track ); + } + + template< class RT , class R1 , class R2 , class R3 > + KOKKOS_INLINE_FUNCTION + View & operator = ( const View<RT,R1,R2,R3> & rhs ) + { + typedef typename View<RT,R1,R2,R3>::traits SrcTraits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits > Mapping ; + static_assert( Mapping::is_assignable , "Incompatible View copy assignment" ); + Mapping::assign( m_map , rhs.m_map , rhs.m_track ); + m_track.operator=( rhs.m_track ); + return *this ; + } + + template< class RT , class R1 , class R2 , class R3 > + KOKKOS_INLINE_FUNCTION + View & operator = ( View<RT,R1,R2,R3> && rhs ) + { + typedef typename View<RT,R1,R2,R3>::traits SrcTraits ; + typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits > Mapping ; + static_assert( Mapping::is_assignable , "Incompatible View move assignment" ); + Mapping::assign( m_map , rhs.m_map , rhs.m_track ); + m_track.operator=( rhs.m_track ); + return *this ; + } + + //---------------------------------------- + // Allocation according to allocation properties + +private: + + // Must call destructor for non-trivial types + template< class ExecSpace > + struct DestroyFunctor { + map_type m_map ; + ExecSpace m_space ; + + KOKKOS_INLINE_FUNCTION + void destroy_shared_allocation() { m_map.destroy( m_space ); } + }; + +public: + + inline + const std::string label() const { return m_track.template get_label< typename traits::memory_space >(); } + + template< class Prop > + explicit inline + View( const Prop & arg_prop + , const size_t arg_N0 = 0 + , const size_t arg_N1 = 0 + , 
const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 + ) + : m_track() + , m_map() + { + // Merge the < execution_space , memory_space > into the properties. + typedef Kokkos::Experimental::Impl::ViewAllocProp< typename traits::device_type , Prop > alloc_prop ; + + typedef typename alloc_prop::execution_space execution_space ; + typedef typename traits::memory_space memory_space ; + typedef DestroyFunctor< execution_space > destroy_functor ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< memory_space , destroy_functor > record_type ; + + static_assert( traits::is_managed , "View allocation constructor requires managed memory" ); + + const alloc_prop prop( arg_prop ); + + // Query the mapping for byte-size of allocation. + const size_t alloc_size = map_type::memory_span( prop.allow_padding + , arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ); + + // Allocate memory from the memory space. + record_type * const record = record_type::allocate( prop.memory , prop.label , alloc_size ); + + // Construct the mapping object prior to start of tracking + // to assign destroy functor and possibly initialize. + m_map = map_type( record->data() + , prop.allow_padding + , arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ); + + // Copy the destroy functor into the allocation record before initiating tracking. + record->m_destroy.m_map = m_map ; + record->m_destroy.m_space = prop.execution ; + + if ( prop.initialize.value ) { + m_map.construct( prop.execution ); + } + + // Destroy functor assigned and initialization complete, start tracking + m_track = track_type( record ); + } + + template< class Prop > + explicit inline + View( const Prop & arg_prop + , const typename traits::array_layout & arg_layout + ) + : m_track() + , m_map() + { + // Merge the < execution_space , memory_space > into the properties. 
+ typedef Kokkos::Experimental::Impl::ViewAllocProp< typename traits::device_type , Prop > alloc_prop ; + + typedef typename alloc_prop::execution_space execution_space ; + typedef typename traits::memory_space memory_space ; + typedef DestroyFunctor< execution_space > destroy_functor ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< memory_space , destroy_functor > record_type ; + + static_assert( traits::is_managed , "View allocation constructor requires managed memory" ); + + const alloc_prop prop( arg_prop ); + + // Query the mapping for byte-size of allocation. + const size_t alloc_size = map_type::memory_span( prop.allow_padding , arg_layout ); + + // Allocate memory from the memory space. + record_type * const record = record_type::allocate( prop.memory , prop.label , alloc_size ); + + // Construct the mapping object prior to start of tracking + // to assign destroy functor and possibly initialize. + m_map = map_type( record->data() , prop.allow_padding , arg_layout ); + + // Copy the destroy functor into the allocation record before initiating tracking. + record->m_destroy.m_map = m_map ; + record->m_destroy.m_space = prop.execution ; + + if ( prop.initialize.value ) { + m_map.construct( prop.execution ); + } + + // Destroy functor assigned and initialization complete, start tracking + m_track = track_type( record ); + } + + //---------------------------------------- + // Memory span required to wrap these dimensions. 
+ static constexpr size_t memory_span( const size_t arg_N0 = 0 + , const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 + ) + { + return map_type::memory_span( std::integral_constant<bool,false>() + , arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ); + } + + explicit inline + View( typename traits::value_type * const arg_ptr + , const size_t arg_N0 = 0 + , const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 + ) + : m_track() // No memory tracking + , m_map( arg_ptr , std::integral_constant<bool,false>() + , arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) + {} + + explicit inline + View( typename traits::value_type * const arg_ptr + , typename traits::array_layout & arg_layout + ) + : m_track() // No memory tracking + , m_map( arg_ptr , std::integral_constant<bool,false>(), arg_layout ) + {} + + //---------------------------------------- + // Shared scratch memory constructor + + static inline + size_t shmem_size( const size_t arg_N0 = 0 , + const size_t arg_N1 = 0 , + const size_t arg_N2 = 0 , + const size_t arg_N3 = 0 , + const size_t arg_N4 = 0 , + const size_t arg_N5 = 0 , + const size_t arg_N6 = 0 , + const size_t arg_N7 = 0 ) + { + return map_type::memory_span( std::integral_constant<bool,false>() + , arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ); + } + + explicit KOKKOS_INLINE_FUNCTION + View( const typename traits::execution_space::scratch_memory_space & arg_space + , const size_t arg_N0 = 0 + , const size_t arg_N1 = 0 + , const size_t arg_N2 = 0 + , const size_t arg_N3 = 0 + , const size_t arg_N4 = 0 + , const size_t arg_N5 = 0 + , const size_t arg_N6 = 0 + , const size_t arg_N7 = 0 ) + : m_track() // No 
memory tracking + , m_map( arg_space.get_shmem( map_type::memory_span( std::integral_constant<bool,false>() + , arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) + , std::integral_constant<bool,false>() + , arg_N0 , arg_N1 , arg_N2 , arg_N3 + , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) + {} + + //---------------------------------------- + // Subviews + +private: + + explicit KOKKOS_INLINE_FUNCTION + View( const track_type & rhs ) + : m_track( rhs ) + , m_map() + {} + +public: + + template< class D , class A1 , class A2 , class A3 + , class T0 , class T1 , class T2 , class T3 + , class T4 , class T5 , class T6 , class T7 > + friend + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T7>::is_range + > + subview( const View< D , A1 , A2 , A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3 + , T4 const & arg4 , T5 const & arg5 , T6 const & arg6 , T7 const & arg7 + ); + + template< class D , class A1 , class A2 , class A3 + , class T0 , class T1 , class T2 , class T3 + , class T4 , class T5 , class T6 > + friend + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + , 
Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range + > + subview( const View< D , A1 , A2 , A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3 + , T4 const & arg4 , T5 const & arg5 , T6 const & arg6 + ); + + template< class D , class A1 , class A2 , class A3 + , class T0 , class T1 , class T2 , class T3 + , class T4 , class T5 > + friend + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range + > + subview( const View< D , A1 , A2 , A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3 + , T4 const & arg4 , T5 const & arg5 + ); + + template< class D , class A1 , class A2 , class A3 + , class T0 , class T1 , class T2 , class T3 + , class T4 > + friend + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + > + subview( const View< D , A1 , A2 , A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3 + , T4 const & arg4 + ); + + template< class D , class A1 , class A2 , class A3 + , class T0 , class T1 , class T2 , class T3 > + friend + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , 
Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + > + subview( const View< D , A1 , A2 , A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3 + ); + + template< class D , class A1 , class A2 , class A3 + , class T0 , class T1 , class T2 > + friend + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + > + subview( const View< D , A1 , A2 , A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 + ); + + template< class D , class A1 , class A2 , class A3 + , class T0 , class T1 > + friend + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + > + subview( const View< D , A1 , A2 , A3 > & src + , T0 const & arg0 , T1 const & arg1 + ); + + template< class D, class A1, class A2, class A3, class T0 > + friend + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::Subview< View< D, A1, A2, A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + > + subview( const View< D, A1, A2, A3 > & src , T0 const & arg0 ); + +}; + +template< class > struct is_view : public std::false_type {}; + +template< class D, class A1, class A2, class A3 > +struct is_view< View<D,A1,A2,A3> > : public std::true_type {}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template< class D, class A1, class A2, class A3 + , class T0 , class T1 , class T2 , class T3 + , class T4 , class T5 , class T6 , class T7 > +KOKKOS_INLINE_FUNCTION 
+Kokkos::Experimental::Subview< View< D, A1, A2, A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T7>::is_range + > +subview( const View< D, A1, A2, A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3 + , T4 const & arg4 , T5 const & arg5 , T6 const & arg6 , T7 const & arg7 + ) +{ + typedef View< D, A1, A2, A3 > SrcView ; + + typedef Kokkos::Experimental::Impl::SubviewMapping + < typename SrcView::traits + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T7>::is_range + > Mapping ; + + typedef typename Mapping::type DstView ; + + static_assert( SrcView::Rank == 8 , "Subview of rank 8 View requires 8 arguments" ); + + DstView dst( src.m_track ); + + Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 ); + + return dst ; +} + +template< class D, class A1, class A2, class A3 + , class T0 , class T1 , class T2 , class T3 + , class T4 , class T5 , class T6 > +KOKKOS_INLINE_FUNCTION +Kokkos::Experimental::Subview< View< D, A1, A2, A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , 
Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range + > +subview( const View< D, A1, A2, A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3 + , T4 const & arg4 , T5 const & arg5 , T6 const & arg6 + ) +{ + typedef View< D, A1, A2, A3 > SrcView ; + + typedef Kokkos::Experimental::Impl::SubviewMapping + < typename SrcView::traits + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range + > Mapping ; + + typedef typename Mapping::type DstView ; + + static_assert( SrcView::Rank == 7 , "Subview of rank 7 View requires 7 arguments" ); + + DstView dst( src.m_track ); + + Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, arg3, arg4, arg5, arg6, 0 ); + + return dst ; +} + +template< class D, class A1, class A2, class A3 + , class T0 , class T1 , class T2 , class T3 + , class T4 , class T5 > +KOKKOS_INLINE_FUNCTION +Kokkos::Experimental::Subview< View< D, A1, A2, A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range + > +subview( const 
View< D, A1, A2, A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3 + , T4 const & arg4 , T5 const & arg5 + ) +{ + typedef View< D, A1, A2, A3 > SrcView ; + + typedef Kokkos::Experimental::Impl::SubviewMapping + < typename SrcView::traits + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range + > Mapping ; + + typedef typename Mapping::type DstView ; + + static_assert( SrcView::Rank == 6 , "Subview of rank 6 View requires 6 arguments" ); + + DstView dst( src.m_track ); + + Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, arg3, arg4, arg5, 0, 0 ); + + return dst ; +} + +template< class D, class A1, class A2, class A3 + , class T0 , class T1 , class T2 , class T3 + , class T4 > +KOKKOS_INLINE_FUNCTION +Kokkos::Experimental::Subview< View< D, A1, A2, A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + > +subview( const View< D, A1, A2, A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3 + , T4 const & arg4 + ) +{ + typedef View< D, A1, A2, A3 > SrcView ; + + typedef Kokkos::Experimental::Impl::SubviewMapping + < typename SrcView::traits + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + , 
Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range + > Mapping ; + + typedef typename Mapping::type DstView ; + + static_assert( SrcView::Rank == 5 , "Subview of rank 5 View requires 5 arguments" ); + + DstView dst( src.m_track ); + + Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, arg3, arg4, 0, 0, 0 ); + + return dst ; +} + +template< class D, class A1, class A2, class A3 + , class T0 , class T1 , class T2 , class T3 > +KOKKOS_INLINE_FUNCTION +Kokkos::Experimental::Subview< View< D, A1, A2, A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + > +subview( const View< D, A1, A2, A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3 + ) +{ + typedef View< D, A1, A2, A3 > SrcView ; + + typedef Kokkos::Experimental::Impl::SubviewMapping + < typename SrcView::traits + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range + > Mapping ; + + typedef typename Mapping::type DstView ; + + static_assert( SrcView::Rank == 4 , "Subview of rank 4 View requires 4 arguments" ); + + DstView dst( src.m_track ); + + Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, arg3, 0, 0, 0, 0 ); + + return dst ; +} + +template< class D, class A1, class A2, class A3 + , class T0 , class T1 , class T2 > +KOKKOS_INLINE_FUNCTION +Kokkos::Experimental::Subview< View< D, A1, A2, A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + > +subview( const View< D, A1, A2, A3 > & src + , T0 const & arg0 , T1 const & arg1 , T2 const 
& arg2 + ) +{ + typedef View< D, A1, A2, A3 > SrcView ; + + typedef Kokkos::Experimental::Impl::SubviewMapping + < typename SrcView::traits + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range + > Mapping ; + + typedef typename Mapping::type DstView ; + + static_assert( SrcView::Rank == 3 , "Subview of rank 3 View requires 3 arguments" ); + + DstView dst( src.m_track ); + + Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, 0, 0, 0, 0, 0 ); + + return dst ; +} + +template< class D, class A1, class A2, class A3 + , class T0 , class T1 > +KOKKOS_INLINE_FUNCTION +Kokkos::Experimental::Subview< View< D, A1, A2, A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + > +subview( const View< D, A1, A2, A3 > & src + , T0 const & arg0 , T1 const & arg1 + ) +{ + typedef View< D, A1, A2, A3 > SrcView ; + + typedef Kokkos::Experimental::Impl::SubviewMapping + < typename SrcView::traits + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range + > Mapping ; + + typedef typename Mapping::type DstView ; + + static_assert( SrcView::Rank == 2 , "Subview of rank 2 View requires 2 arguments" ); + + DstView dst( src.m_track ); + + Mapping::assign( dst.m_map, src.m_map, arg0, arg1, 0, 0, 0, 0, 0, 0 ); + + return dst ; +} + +template< class D, class A1, class A2, class A3, class T0 > +KOKKOS_INLINE_FUNCTION +Kokkos::Experimental::Subview< View< D, A1, A2, A3 > + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + > +subview( const View< D, A1, A2, A3 > & src , T0 const & arg0 ) +{ + typedef View< D, A1, A2, A3 > SrcView ; + + typedef Kokkos::Experimental::Impl::SubviewMapping + < typename SrcView::traits + , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range + > Mapping ; + + typedef typename 
Mapping::type DstView ; + + static_assert( SrcView::Rank == 1 , "Subview of rank 1 View requires 1 arguments" ); + + DstView dst( src.m_track ); + + Mapping::assign( dst.m_map , src.m_map , arg0, 0, 0, 0, 0, 0, 0, 0 ); + + return dst ; +} + +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +template< class LT , class L1 , class L2 , class L3 + , class RT , class R1 , class R2 , class R3 > +KOKKOS_INLINE_FUNCTION +bool operator == ( const View<LT,L1,L2,L3> & lhs , + const View<RT,R1,R2,R3> & rhs ) +{ + // Same data, layout, dimensions + typedef ViewTraits<LT,L1,L2,L3> lhs_traits ; + typedef ViewTraits<RT,R1,R2,R3> rhs_traits ; + + return + std::is_same< typename lhs_traits::const_value_type , + typename rhs_traits::const_value_type >::value && + std::is_same< typename lhs_traits::array_layout , + typename rhs_traits::array_layout >::value && + std::is_same< typename lhs_traits::memory_space , + typename rhs_traits::memory_space >::value && + lhs_traits::Rank == rhs_traits::Rank && + lhs.data() == rhs.data() && + lhs.span() == rhs.span() && + lhs.dimension_0() == rhs.dimension_0() && + lhs.dimension_1() == rhs.dimension_1() && + lhs.dimension_2() == rhs.dimension_2() && + lhs.dimension_3() == rhs.dimension_3() && + lhs.dimension_4() == rhs.dimension_4() && + lhs.dimension_5() == rhs.dimension_5() && + lhs.dimension_6() == rhs.dimension_6() && + lhs.dimension_7() == rhs.dimension_7(); +} + +template< class LT , class L1 , class L2 , class L3 + , class RT , class R1 , class R2 , class R3 > +KOKKOS_INLINE_FUNCTION +bool operator != ( const View<LT,L1,L2,L3> & lhs , + const View<RT,R1,R2,R3> & rhs ) +{ + return ! 
( operator==(lhs,rhs) ); +} + +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template< class OutputView , typename Enable = void > +struct ViewFill { + + typedef typename OutputView::const_value_type const_value_type ; + + const OutputView output ; + const_value_type input ; + + KOKKOS_INLINE_FUNCTION + void operator()( const size_t i0 ) const + { + const size_t n1 = output.dimension_1(); + const size_t n2 = output.dimension_2(); + const size_t n3 = output.dimension_3(); + const size_t n4 = output.dimension_4(); + const size_t n5 = output.dimension_5(); + const size_t n6 = output.dimension_6(); + const size_t n7 = output.dimension_7(); + + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) { + for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) { + for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) { + for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) { + for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) { + output(i0,i1,i2,i3,i4,i5,i6,i7) = input ; + }}}}}}} + } + + ViewFill( const OutputView & arg_out , const_value_type & arg_in ) + : output( arg_out ), input( arg_in ) + { + typedef typename OutputView::execution_space execution_space ; + typedef Kokkos::RangePolicy< execution_space > Policy ; + + (void) Kokkos::Impl::ParallelFor< ViewFill , Policy >( *this , Policy( 0 , output.dimension_0() ) ); + + execution_space::fence(); + } +}; + +template< class OutputView > +struct ViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > { + ViewFill( const OutputView & dst , const typename OutputView::const_value_type & src ) + { + Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace > + ( dst.data() , & src , sizeof(typename OutputView::const_value_type) ); + 
} +}; + +template< class OutputView , class InputView > +struct ViewRemap { + + const OutputView output ; + const InputView input ; + const size_t n0 ; + const size_t n1 ; + const size_t n2 ; + const size_t n3 ; + const size_t n4 ; + const size_t n5 ; + const size_t n6 ; + const size_t n7 ; + + ViewRemap( const OutputView & arg_out , const InputView & arg_in ) + : output( arg_out ), input( arg_in ) + , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) ) + , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) ) + , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) ) + , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) ) + , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) ) + , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) ) + , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) ) + , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) ) + { + typedef typename OutputView::execution_space execution_space ; + typedef Kokkos::RangePolicy< execution_space > Policy ; + (void) Kokkos::Impl::ParallelFor< ViewRemap , Policy >( *this , Policy( 0 , n0 ) ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_t i0 ) const + { + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) { + for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) { + for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) { + for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) { + for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) { + output(i0,i1,i2,i3,i4,i5,i6,i7) = input(i0,i1,i2,i3,i4,i5,i6,i7); + }}}}}}} + } +}; + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + 
+namespace Kokkos { +namespace Experimental { + +/** \brief Deep copy a value from Host memory into a view. */ +template< class DT , class D1 , class D2 , class D3 > +inline +void deep_copy( const View<DT,D1,D2,D3> & dst + , typename ViewTraits<DT,D1,D2,D3>::const_value_type & value ) +{ + static_assert( std::is_same< typename ViewTraits<DT,D1,D2,D3>::non_const_value_type , + typename ViewTraits<DT,D1,D2,D3>::value_type >::value + , "ERROR: Incompatible deep_copy( View , value )" ); + + Kokkos::Experimental::Impl::ViewFill< View<DT,D1,D2,D3> >( dst , value ); +} + +/** \brief Deep copy into a value in Host memory from a view. */ +template< class ST , class S1 , class S2 , class S3 > +inline +void deep_copy( ST & dst , const View<ST,S1,S2,S3> & src ) +{ + static_assert( ViewTraits<ST,S1,S2,S3>::rank == 0 + , "ERROR: Non-rank-zero view in deep_copy( value , View )" ); + + typedef ViewTraits<ST,S1,S2,S3> src_traits ; + typedef typename src_traits::memory_space src_memory_space ; + Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) ); +} + +//---------------------------------------------------------------------------- +/** \brief A deep copy between views of compatible type, and rank zero. 
*/ +template< class DT , class D1 , class D2 , class D3 + , class ST , class S1 , class S2 , class S3 > +inline +void deep_copy( const View<DT,D1,D2,D3> & dst , + const View<ST,S1,S2,S3> & src , + typename std::enable_if<( + // Rank zero: + ( unsigned(ViewTraits<DT,D1,D2,D3>::rank) == unsigned(0) ) && + ( unsigned(ViewTraits<ST,S1,S2,S3>::rank) == unsigned(0) ) && + // Same type and destination is not constant: + std::is_same< typename ViewTraits<DT,D1,D2,D3>::value_type , + typename ViewTraits<ST,S1,S2,S3>::non_const_value_type >::value + )>::type * = 0 ) +{ + typedef View<DT,D1,D2,D3> dst_type ; + typedef View<ST,S1,S2,S3> src_type ; + + typedef typename dst_type::value_type value_type ; + typedef typename dst_type::memory_space dst_memory_space ; + typedef typename src_type::memory_space src_memory_space ; + + if ( dst.data() != src.data() ) { + Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) ); + } +} + +//---------------------------------------------------------------------------- +/** \brief A deep copy between views of the default specialization, compatible type, + * same non-zero rank, same contiguous layout. + */ +template< class DT , class D1 , class D2 , class D3 , + class ST , class S1 , class S2 , class S3 > +inline +void deep_copy( const View<DT,D1,D2,D3> & dst , + const View<ST,S1,S2,S3> & src , + typename std::enable_if<( + // destination is non-const. 
+ std::is_same< typename ViewTraits<DT,D1,D2,D3>::value_type , + typename ViewTraits<DT,D1,D2,D3>::non_const_value_type >::value + && + // Same non-zero rank: + ( unsigned(ViewTraits<DT,D1,D2,D3>::rank) != 0 ) + && + ( unsigned(ViewTraits<DT,D1,D2,D3>::rank) == + unsigned(ViewTraits<ST,S1,S2,S3>::rank) ) + && + // Not specialized, default ViewMapping + std::is_same< typename ViewTraits<DT,D1,D2,D3>::specialize , void >::value + && + std::is_same< typename ViewTraits<ST,S1,S2,S3>::specialize , void >::value + )>::type * = 0 ) +{ + typedef View<DT,D1,D2,D3> dst_type ; + typedef View<ST,S1,S2,S3> src_type ; + + typedef typename dst_type::execution_space dst_execution_space ; + typedef typename dst_type::memory_space dst_memory_space ; + typedef typename src_type::memory_space src_memory_space ; + + enum { DstExecCanAccessSrc = + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value }; + + if ( (void *) dst.data() != (void*) src.data() ) { + + // Concern: If overlapping views then a parallel copy will be erroneous. + // ... 
+ + // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy + + if ( std::is_same< typename ViewTraits<DT,D1,D2,D3>::value_type , + typename ViewTraits<ST,S1,S2,S3>::non_const_value_type >::value && + std::is_same< typename ViewTraits<DT,D1,D2,D3>::array_layout , + typename ViewTraits<ST,S1,S2,S3>::array_layout >::value && + dst.span_is_contiguous() && + src.span_is_contiguous() && + dst.span() == src.span() && + dst.dimension_0() == src.dimension_0() && + dst.dimension_1() == src.dimension_1() && + dst.dimension_2() == src.dimension_2() && + dst.dimension_3() == src.dimension_3() && + dst.dimension_4() == src.dimension_4() && + dst.dimension_5() == src.dimension_5() && + dst.dimension_6() == src.dimension_6() && + dst.dimension_7() == src.dimension_7() ) { + + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + + Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes ); + } + else if ( DstExecCanAccessSrc ) { + // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. + Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src ); + } + else { + Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); + } + } +} + +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +template< class T , class A1, class A2, class A3 > +inline +typename Kokkos::Experimental::View<T,A1,A2,A3>::HostMirror +create_mirror( const Kokkos::Experimental::View<T,A1,A2,A3> & src + , typename std::enable_if< + ! 
std::is_same< typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::array_layout + , Kokkos::LayoutStride >::value + >::type * = 0 + ) +{ + typedef View<T,A1,A2,A3> src_type ; + typedef typename src_type::HostMirror dst_type ; + + return dst_type( std::string( src.label() ).append("_mirror") + , src.dimension_0() + , src.dimension_1() + , src.dimension_2() + , src.dimension_3() + , src.dimension_4() + , src.dimension_5() + , src.dimension_6() + , src.dimension_7() ); +} + +template< class T , class A1, class A2, class A3 > +inline +typename Kokkos::Experimental::View<T,A1,A2,A3>::HostMirror +create_mirror( const Kokkos::Experimental::View<T,A1,A2,A3> & src + , typename std::enable_if< + std::is_same< typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::array_layout + , Kokkos::LayoutStride >::value + >::type * = 0 + ) +{ + typedef View<T,A1,A2,A3> src_type ; + typedef typename src_type::HostMirror dst_type ; + + Kokkos::LayoutStride layout ; + + layout.dimension[0] = src.dimension_0(); + layout.dimension[1] = src.dimension_1(); + layout.dimension[2] = src.dimension_2(); + layout.dimension[3] = src.dimension_3(); + layout.dimension[4] = src.dimension_4(); + layout.dimension[5] = src.dimension_5(); + layout.dimension[6] = src.dimension_6(); + layout.dimension[7] = src.dimension_7(); + + layout.stride[0] = src.stride_0(); + layout.stride[1] = src.stride_1(); + layout.stride[2] = src.stride_2(); + layout.stride[3] = src.stride_3(); + layout.stride[4] = src.stride_4(); + layout.stride[5] = src.stride_5(); + layout.stride[6] = src.stride_6(); + layout.stride[7] = src.stride_7(); + + return dst_type( std::string( src.label() ).append("_mirror") , layout ); +} + +template< class T , class A1 , class A2 , class A3 > +inline +typename Kokkos::Experimental::View<T,A1,A2,A3>::HostMirror +create_mirror_view( const Kokkos::Experimental::View<T,A1,A2,A3> & src + , typename std::enable_if<( + std::is_same< typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::memory_space + , 
typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::host_mirror_space + >::value + )>::type * = 0 + ) +{ + return src ; +} + +template< class T , class A1 , class A2 , class A3 > +inline +typename Kokkos::Experimental::View<T,A1,A2,A3>::HostMirror +create_mirror_view( const Kokkos::Experimental::View<T,A1,A2,A3> & src + , typename std::enable_if<( + ! std::is_same< typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::memory_space + , typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::host_mirror_space + >::value + )>::type * = 0 + ) +{ + return Kokkos::Experimental::create_mirror( src ); +} + +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +/** \brief Resize a view with copying old data to new data at the corresponding indices. */ +template< class T , class A1 , class A2 , class A3 > +inline +void resize( Kokkos::Experimental::View<T,A1,A2,A3> & v , + const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) +{ + typedef Kokkos::Experimental::View<T,A1,A2,A3> view_type ; + + static_assert( Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::is_managed , "Can only resize managed views" ); + + view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 ); + + Kokkos::Experimental::Impl::ViewRemap< view_type , view_type >( v_resized , v ); + + v = v_resized ; +} + +/** \brief Resize a view with copying old data to new data at the corresponding indices. 
*/ +template< class T , class A1 , class A2 , class A3 > +inline +void realloc( Kokkos::Experimental::View<T,A1,A2,A3> & v , + const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) +{ + typedef Kokkos::Experimental::View<T,A1,A2,A3> view_type ; + + static_assert( Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::is_managed , "Can only realloc managed views" ); + + const std::string label = v.label(); + + v = view_type(); // Deallocate first, if the only view to allocation + v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 ); +} + +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + +namespace Kokkos { + +template< class D , class A1 = void , class A2 = void , class A3 = void > +using ViewTraits = Kokkos::Experimental::ViewTraits<D,A1,A2,A3> ; + +template< class D , class A1 = void , class A2 = void , class A3 = void , class S = void > +using View = Kokkos::Experimental::View<D,A1,A2,A3> ; + +using Kokkos::Experimental::deep_copy ; +using Kokkos::Experimental::create_mirror ; +using Kokkos::Experimental::create_mirror_view ; +using Kokkos::Experimental::subview ; +using Kokkos::Experimental::resize ; +using Kokkos::Experimental::realloc ; + +namespace Impl { + +using Kokkos::Experimental::is_view ; + +class ViewDefault {}; + +template< class SrcViewType + , class Arg0Type + , class Arg1Type + , class Arg2Type + , class Arg3Type + , class Arg4Type + , class Arg5Type + , class Arg6Type + , class Arg7Type + > +struct ViewSubview /* { typedef ... 
type ; } */ ; + +} + +} /* namespace Kokkos */ + +#include <impl/Kokkos_Atomic_View.hpp> + +#endif /* #if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif + diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp new file mode 100755 index 0000000000000000000000000000000000000000..60009e6d4dd66aa7d817e294203363797133664d --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp @@ -0,0 +1,285 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Atomic.hpp +/// \brief Atomic functions +/// +/// This header file defines prototypes for the following atomic functions: +/// - exchange +/// - compare and exchange +/// - add +/// +/// Supported types include: +/// - signed and unsigned 4 and 8 byte integers +/// - float +/// - double +/// +/// They are implemented through GCC compatible intrinsics, OpenMP +/// directives and native CUDA intrinsics. +/// +/// Including this header file requires one of the following +/// compilers: +/// - NVCC (for CUDA device code only) +/// - GCC (for host code only) +/// - Intel (for host code only) +/// - A compiler that supports OpenMP 3.1 (for host code only) + +#ifndef KOKKOS_ATOMIC_HPP +#define KOKKOS_ATOMIC_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_HostSpace.hpp> +#include <impl/Kokkos_Traits.hpp> + +//---------------------------------------------------------------------------- +#if defined(_WIN32) +#define KOKKOS_ATOMICS_USE_WINDOWS +#else +#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA ) + +// Compiling NVIDIA device code, must use Cuda atomics: + +#define KOKKOS_ATOMICS_USE_CUDA + +#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \ + ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \ + ! 
defined( KOKKOS_ATOMICS_USE_OMP31 ) + +// Compiling for non-Cuda atomic implementation has not been pre-selected. +// Choose the best implementation for the detected compiler. +// Preference: GCC, INTEL, OMP31 + +#if defined( KOKKOS_COMPILER_GNU ) || \ + defined( KOKKOS_COMPILER_CLANG ) || \ + ( defined ( KOKKOS_COMPILER_NVCC ) && defined ( __GNUC__ ) ) + +#define KOKKOS_ATOMICS_USE_GCC + +#elif defined( KOKKOS_COMPILER_INTEL ) || \ + defined( KOKKOS_COMPILER_CRAYC ) + +#define KOKKOS_ATOMICS_USE_INTEL + +#elif defined( _OPENMP ) && ( 201107 <= _OPENMP ) + +#define KOKKOS_ATOMICS_USE_OMP31 + +#else + +#error "KOKKOS_ATOMICS_USE : Unsupported compiler" + +#endif + +#endif /* Not pre-selected atomic implementation */ +#endif + +//---------------------------------------------------------------------------- + +// Forward decalaration of functions supporting arbitrary sized atomics +// This is necessary since Kokkos_Atomic.hpp is internally included very early +// through Kokkos_HostSpace.hpp as well as the allocation tracker. +#ifdef KOKKOS_HAVE_CUDA +namespace Kokkos { +namespace Impl { +/// \brief Aquire a lock for the address +/// +/// This function tries to aquire the lock for the hash value derived +/// from the provided ptr. If the lock is successfully aquired the +/// function returns true. Otherwise it returns false. +__device__ inline +bool lock_address_cuda_space(void* ptr); + +/// \brief Release lock for the address +/// +/// This function releases the lock for the hash value derived +/// from the provided ptr. This function should only be called +/// after previously successfully aquiring a lock with +/// lock_address. 
+__device__ inline +void unlock_address_cuda_space(void* ptr); +} +} +#endif + + +namespace Kokkos { +template <typename T> +KOKKOS_INLINE_FUNCTION +void atomic_add(volatile T * const dest, const T src); + +// Atomic increment +template<typename T> +KOKKOS_INLINE_FUNCTION +void atomic_increment(volatile T* a); + +template<typename T> +KOKKOS_INLINE_FUNCTION +void atomic_decrement(volatile T* a); +} + +#if ! defined(_WIN32) +#include<impl/Kokkos_Atomic_Assembly_X86.hpp> +#endif + +namespace Kokkos { + + +inline +const char * atomic_query_version() +{ +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + return "KOKKOS_ATOMICS_USE_CUDA" ; +#elif defined( KOKKOS_ATOMICS_USE_GCC ) + return "KOKKOS_ATOMICS_USE_GCC" ; +#elif defined( KOKKOS_ATOMICS_USE_INTEL ) + return "KOKKOS_ATOMICS_USE_INTEL" ; +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + return "KOKKOS_ATOMICS_USE_OMP31" ; +#elif defined( KOKKOS_ATOMICS_USE_WINDOWS ) + return "KOKKOS_ATOMICS_USE_WINDOWS"; +#endif +} + +} // namespace Kokkos + +#ifdef _WIN32 +#include "impl/Kokkos_Atomic_Windows.hpp" +#else +//#include "impl/Kokkos_Atomic_Assembly_X86.hpp" + +//---------------------------------------------------------------------------- +// Atomic exchange +// +// template< typename T > +// T atomic_exchange( volatile T* const dest , const T val ) +// { T tmp = *dest ; *dest = val ; return tmp ; } + +#include "impl/Kokkos_Atomic_Exchange.hpp" + +//---------------------------------------------------------------------------- +// Atomic compare-and-exchange +// +// template<class T> +// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val) +// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; } + +#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp" + +//---------------------------------------------------------------------------- +// Atomic fetch and add +// +// template<class T> +// T atomic_fetch_add(volatile T* const dest, const T val) +// { T tmp = *dest ; 
*dest += val ; return tmp ; } + +#include "impl/Kokkos_Atomic_Fetch_Add.hpp" + +//---------------------------------------------------------------------------- +// Atomic fetch and sub +// +// template<class T> +// T atomic_fetch_sub(volatile T* const dest, const T val) +// { T tmp = *dest ; *dest -= val ; return tmp ; } + +#include "impl/Kokkos_Atomic_Fetch_Sub.hpp" + +//---------------------------------------------------------------------------- +// Atomic fetch and or +// +// template<class T> +// T atomic_fetch_or(volatile T* const dest, const T val) +// { T tmp = *dest ; *dest = tmp | val ; return tmp ; } + +#include "impl/Kokkos_Atomic_Fetch_Or.hpp" + +//---------------------------------------------------------------------------- +// Atomic fetch and and +// +// template<class T> +// T atomic_fetch_and(volatile T* const dest, const T val) +// { T tmp = *dest ; *dest = tmp & val ; return tmp ; } + +#include "impl/Kokkos_Atomic_Fetch_And.hpp" +#endif /*Not _WIN32*/ + +//---------------------------------------------------------------------------- +// Memory fence +// +// All loads and stores from this thread will be globally consistent before continuing +// +// void memory_fence() {...}; +#include "impl/Kokkos_Memory_Fence.hpp" + +//---------------------------------------------------------------------------- +// Provide volatile_load and safe_load +// +// T volatile_load(T const volatile * const ptr); +// +// T const& safe_load(T const * const ptr); +// XEON PHI +// T safe_load(T const * const ptr + +#include "impl/Kokkos_Volatile_Load.hpp" + +#ifndef _WIN32 +#include "impl/Kokkos_Atomic_Generic.hpp" +#endif +//---------------------------------------------------------------------------- +// This atomic-style macro should be an inlined function, not a macro + +#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__) + + #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0) + #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) 
__builtin_prefetch(addr,1,0) + +#else + + #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0) + #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0) + +#endif + +//---------------------------------------------------------------------------- + +#endif /* KOKKOS_ATOMIC_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp new file mode 100755 index 0000000000000000000000000000000000000000..c521e23159884744c21adb368f43247944c95e91 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -0,0 +1,228 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CORE_HPP +#define KOKKOS_CORE_HPP + +//---------------------------------------------------------------------------- +// Include the execution space header files for the enabled execution spaces. + +#include <Kokkos_Core_fwd.hpp> + +#if defined( KOKKOS_HAVE_CUDA ) +#include <Kokkos_Cuda.hpp> +#endif + +#if defined( KOKKOS_HAVE_OPENMP ) +#include <Kokkos_OpenMP.hpp> +#endif + +#if defined( KOKKOS_HAVE_SERIAL ) +#include <Kokkos_Serial.hpp> +#endif + +#if defined( KOKKOS_HAVE_PTHREAD ) +#include <Kokkos_Threads.hpp> +#endif + +#include <Kokkos_Pair.hpp> +#include <Kokkos_View.hpp> +#include <Kokkos_Vectorization.hpp> +#include <Kokkos_Atomic.hpp> +#include <Kokkos_hwloc.hpp> + +#include <iostream> + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +struct InitArguments { + int num_threads; + int num_numa; + int device_id; + + InitArguments() { + num_threads = -1; + num_numa = -1; + device_id = -1; + } +}; + +void initialize(int& narg, char* arg[]); + +void initialize(const InitArguments& args = InitArguments()); + +/** \brief Finalize the spaces that were initialized via Kokkos::initialize */ +void finalize(); + +/** \brief Finalize all known execution spaces */ +void finalize_all(); + +void fence(); + 
+} + +#ifdef KOKKOS_HAVE_CXX11 +namespace Kokkos { + +namespace Impl { +// should only by used by kokkos_malloc and kokkos_free +struct MallocHelper +{ + static void increment_ref_count( AllocationTracker const & tracker ) + { + tracker.increment_ref_count(); + } + + static void decrement_ref_count( AllocationTracker const & tracker ) + { + tracker.decrement_ref_count(); + } +}; +} // namespace Impl + +/* Allocate memory from a memory space. + * The allocation is tracked in Kokkos memory tracking system, so + * leaked memory can be identified. + */ +template< class Arg = DefaultExecutionSpace> +void* kokkos_malloc(const std::string label, size_t count) { + typedef typename Arg::memory_space MemorySpace; + Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);; + Impl::MallocHelper::increment_ref_count( tracker ); + return tracker.alloc_ptr(); +} + +template< class Arg = DefaultExecutionSpace> +void* kokkos_malloc(const size_t& count) { + return kokkos_malloc<Arg>("DefaultLabel",count); +} + + +/* Free memory from a memory space. 
+ */ +template< class Arg = DefaultExecutionSpace> +void kokkos_free(const void* ptr) { + typedef typename Arg::memory_space MemorySpace; + typedef typename MemorySpace::allocator allocator; + Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr); + if (tracker.is_valid()) { + Impl::MallocHelper::decrement_ref_count( tracker ); + } +} + + +template< class Arg = DefaultExecutionSpace> +const void* kokkos_realloc(const void* old_ptr, size_t size) { + typedef typename Arg::memory_space MemorySpace; + typedef typename MemorySpace::allocator allocator; + Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr); + + tracker.reallocate(size); + + return tracker.alloc_ptr(); +} + +} // namespace Kokkos +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space > +inline +void * kokkos_malloc( const size_t arg_alloc_size ) +{ + typedef typename Space::memory_space MemorySpace ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ; + + RecordHost * const r = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size ); + + RecordBase::increment( r ); + + return r->data(); +} + +template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space > +inline +void kokkos_free( void * arg_alloc ) +{ + typedef typename Space::memory_space MemorySpace ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ; + + RecordHost * const r = RecordHost::get_record( arg_alloc ); + + RecordBase::decrement( r ); +} + +template< 
class Space = typename Kokkos::DefaultExecutionSpace::memory_space > +inline +void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size ) +{ + typedef typename Space::memory_space MemorySpace ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ; + + RecordHost * const r_old = RecordHost::get_record( arg_alloc ); + RecordHost * const r_new = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size ); + + Kokkos::Impl::DeepCopy<MemorySpace,MemorySpace>( r_new->data() , r_old->data() + , std::min( r_old->size() , r_new->size() ) ); + + RecordBase::increment( r_new ); + RecordBase::decrement( r_old ); + + return r_new->data(); +} + +} // namespace Experimental +} // namespace Kokkos + +#endif + diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp new file mode 100755 index 0000000000000000000000000000000000000000..2cde9299a4d070abb41712a3a352f20bb1b81530 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -0,0 +1,170 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CORE_FWD_HPP +#define KOKKOS_CORE_FWD_HPP + +//---------------------------------------------------------------------------- +// Kokkos_Macros.hpp does introspection on configuration options +// and compiler environment then sets a collection of #define macros. 
+ +#include <Kokkos_Macros.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Forward declarations for class inter-relationships + +namespace Kokkos { + +class HostSpace ; ///< Memory space for main process and CPU execution spaces + +#if defined( KOKKOS_HAVE_SERIAL ) +class Serial ; ///< Execution space main process on CPU +#endif // defined( KOKKOS_HAVE_SERIAL ) + +#if defined( KOKKOS_HAVE_PTHREAD ) +class Threads ; ///< Execution space with pthreads back-end +#endif + +#if defined( KOKKOS_HAVE_OPENMP ) +class OpenMP ; ///< OpenMP execution space +#endif + +#if defined( KOKKOS_HAVE_CUDA ) +class CudaSpace ; ///< Memory space on Cuda GPU +class CudaUVMSpace ; ///< Memory space on Cuda GPU with UVM +class CudaHostPinnedSpace ; ///< Memory space on Host accessible to Cuda GPU +class Cuda ; ///< Execution space for Cuda GPU +#endif + +template<class ExecutionSpace, class MemorySpace> +struct Device; +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Set the default execution space. 
+ +/// Define Kokkos::DefaultExecutionSpace as per configuration option +/// or chosen from the enabled execution spaces in the following order: +/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial + +namespace Kokkos { + +#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) + typedef Cuda DefaultExecutionSpace ; +#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) + typedef OpenMP DefaultExecutionSpace ; +#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) + typedef Threads DefaultExecutionSpace ; +#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) + typedef Serial DefaultExecutionSpace ; +#else +# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads." +#endif + +#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) + typedef OpenMP DefaultHostExecutionSpace ; +#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) + typedef Threads DefaultHostExecutionSpace ; +#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) + typedef Serial DefaultHostExecutionSpace ; +#elif defined ( KOKKOS_HAVE_OPENMP ) + typedef OpenMP DefaultHostExecutionSpace ; +#elif defined ( KOKKOS_HAVE_PTHREAD ) + typedef Threads DefaultHostExecutionSpace ; +#elif defined ( KOKKOS_HAVE_SERIAL ) + typedef Serial DefaultHostExecutionSpace ; +#else +# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads." +#endif + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Detect the active execution space and define its memory space. +// This is used to verify whether a running kernel can access +// a given memory space. 
+ +namespace Kokkos { +namespace Impl { + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined (KOKKOS_HAVE_CUDA) +typedef Kokkos::CudaSpace ActiveExecutionMemorySpace ; +#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) +typedef Kokkos::HostSpace ActiveExecutionMemorySpace ; +#else +typedef void ActiveExecutionMemorySpace ; +#endif + +template< class ActiveSpace , class MemorySpace > +struct VerifyExecutionCanAccessMemorySpace { + enum {value = 0}; +}; + +template< class Space > +struct VerifyExecutionCanAccessMemorySpace< Space , Space > +{ + enum {value = 1}; + KOKKOS_INLINE_FUNCTION static void verify(void) {} + KOKKOS_INLINE_FUNCTION static void verify(const void *) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \ + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \ + Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify( DATA_PTR ) + +#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \ + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \ + Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify() + +namespace Kokkos { + void fence(); +} + +#endif /* #ifndef KOKKOS_CORE_FWD_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp new file mode 100755 index 0000000000000000000000000000000000000000..d736459b54ffffddf3b1a5f087cf8e55cb97b410 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp @@ -0,0 +1,268 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_HPP +#define KOKKOS_CUDA_HPP + +#include <Kokkos_Core_fwd.hpp> + +// If CUDA execution space is enabled then use this header file. 
+ +#if defined( KOKKOS_HAVE_CUDA ) + +#include <iosfwd> +#include <vector> + +#include <Kokkos_CudaSpace.hpp> + +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_ScratchSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <impl/Kokkos_Tags.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +class CudaExec ; +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/// \class Cuda +/// \brief Kokkos Execution Space that uses CUDA to run on GPUs. +/// +/// An "execution space" represents a parallel execution model. It tells Kokkos +/// how to parallelize the execution of kernels in a parallel_for or +/// parallel_reduce. For example, the Threads execution space uses Pthreads or +/// C++11 threads on a CPU, the OpenMP execution space uses the OpenMP language +/// extensions, and the Serial execution space executes "parallel" kernels +/// sequentially. The Cuda execution space uses NVIDIA's CUDA programming +/// model to execute kernels in parallel on GPUs. +class Cuda { +public: + //! \name Type declarations that all Kokkos execution spaces must provide. + //@{ + + //! Tag this class as a kokkos execution space + typedef Cuda execution_space ; + +#if defined( KOKKOS_USE_CUDA_UVM ) + //! This execution space's preferred memory space. + typedef CudaUVMSpace memory_space ; +#else + //! This execution space's preferred memory space. + typedef CudaSpace memory_space ; +#endif + + //! This execution space preferred device_type + typedef Kokkos::Device<execution_space,memory_space> device_type; + + //! The size_type best suited for this execution space. + typedef memory_space::size_type size_type ; + + //! This execution space's preferred array layout. + typedef LayoutLeft array_layout ; + + //! 
+ typedef ScratchMemorySpace< Cuda > scratch_memory_space ; + + //@} + //-------------------------------------------------- + //! \name Functions that all Kokkos devices must implement. + //@{ + + /// \brief True if and only if this method is being called in a + /// thread-parallel function. + KOKKOS_INLINE_FUNCTION static int in_parallel() { +#if defined( __CUDA_ARCH__ ) + return true; +#else + return false; +#endif + } + + /** \brief Set the device in a "sleep" state. + * + * This function sets the device in a "sleep" state in which it is + * not ready for work. This may consume less resources than if the + * device were in an "awake" state, but it may also take time to + * bring the device from a sleep state to be ready for work. + * + * \return True if the device is in the "sleep" state, else false if + * the device is actively working and could not enter the "sleep" + * state. + */ + static bool sleep(); + + /// \brief Wake the device from the 'sleep' state so it is ready for work. + /// + /// \return True if the device is in the "ready" state, else "false" + /// if the device is actively working (which also means that it's + /// awake). + static bool wake(); + + /// \brief Wait until all dispatched functors complete. + /// + /// The parallel_for or parallel_reduce dispatch of a functor may + /// return asynchronously, before the functor completes. This + /// method does not return until all dispatched functors on this + /// device have completed. + static void fence(); + + //! Free any resources being consumed by the device. + static void finalize(); + + //! Has been initialized + static int is_initialized(); + + //! Print configuration information to the given output stream. + static void print_configuration( std::ostream & , const bool detail = false ); + + //@} + //-------------------------------------------------- + //! 
\name Cuda space instances + + ~Cuda() {} + Cuda(); + explicit Cuda( const int instance_id ); + + Cuda( const Cuda & ) = default ; + Cuda( Cuda && ) = default ; + Cuda & operator = ( const Cuda & ) = default ; + Cuda & operator = ( Cuda && ) = default ; + + //-------------------------------------------------------------------------- + //! \name Device-specific functions + //@{ + + struct SelectDevice { + int cuda_device_id ; + SelectDevice() : cuda_device_id(0) {} + explicit SelectDevice( int id ) : cuda_device_id( id ) {} + }; + + //! Initialize, telling the CUDA run-time library which device to use. + static void initialize( const SelectDevice = SelectDevice() + , const size_t num_instances = 1 ); + + /// \brief Cuda device architecture of the selected device. + /// + /// This matches the __CUDA_ARCH__ specification. + static size_type device_arch(); + + //! Query device count. + static size_type detect_device_count(); + + /** \brief Detect the available devices and their architecture + * as defined by the __CUDA_ARCH__ specification. 
+ */ + static std::vector<unsigned> detect_device_arch(); + + cudaStream_t cuda_stream() const { return m_stream ; } + int cuda_device() const { return m_device ; } + + //@} + //-------------------------------------------------------------------------- + +private: + + cudaStream_t m_stream ; + int m_device ; +}; + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template<> +struct VerifyExecutionCanAccessMemorySpace + < Kokkos::CudaSpace + , Kokkos::Cuda::scratch_memory_space + > +{ + enum { value = true }; + KOKKOS_INLINE_FUNCTION static void verify( void ) { } + KOKKOS_INLINE_FUNCTION static void verify( const void * ) { } +}; + +template<> +struct VerifyExecutionCanAccessMemorySpace + < Kokkos::HostSpace + , Kokkos::Cuda::scratch_memory_space + > +{ + enum { value = false }; + inline static void verify( void ) { CudaSpace::access_error(); } + inline static void verify( const void * p ) { CudaSpace::access_error(p); } +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +#include <Cuda/Kokkos_CudaExec.hpp> +#include <Cuda/Kokkos_Cuda_View.hpp> + +#include <KokkosExp_View.hpp> +#include <Cuda/KokkosExp_Cuda_View.hpp> + +#include <Cuda/Kokkos_Cuda_Parallel.hpp> + +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_HAVE_CUDA ) */ +#endif /* #ifndef KOKKOS_CUDA_HPP */ + + + diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp new file mode 100755 index 0000000000000000000000000000000000000000..34915fd382a4bfb4e9b282d678624f93edff03d4 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp @@ -0,0 +1,656 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDASPACE_HPP +#define KOKKOS_CUDASPACE_HPP + +#include <Kokkos_Core_fwd.hpp> + +#if defined( KOKKOS_HAVE_CUDA ) + +#include <iosfwd> +#include <typeinfo> +#include <string> + +#include <Kokkos_HostSpace.hpp> + +#include <impl/Kokkos_AllocationTracker.hpp> + +#include <Cuda/Kokkos_Cuda_abort.hpp> +#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Cuda on-device memory management */ + +class CudaSpace { +public: + + //! Tag this class as a kokkos memory space + typedef CudaSpace memory_space ; + typedef Kokkos::Cuda execution_space ; + typedef Kokkos::Device<execution_space,memory_space> device_type; + + typedef unsigned int size_type ; + + typedef Impl::CudaMallocAllocator allocator; + + /** \brief Allocate a contiguous block of memory. + * + * The input label is associated with the block of memory. + * The block of memory is tracked via reference counting where + * allocation gives it a reference count of one. + */ + static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size ); + + /*--------------------------------*/ + /** \brief Cuda specific function to attached texture object to an allocation. + * Output the texture object, base pointer, and offset from the input pointer. 
+ */ +#if defined( __CUDACC__ ) + static void texture_object_attach( Impl::AllocationTracker const & tracker + , unsigned type_size + , ::cudaChannelFormatDesc const & desc + ); +#endif + + /*--------------------------------*/ + + CudaSpace(); + CudaSpace( const CudaSpace & rhs ) = default ; + CudaSpace & operator = ( const CudaSpace & rhs ) = default ; + ~CudaSpace() = default ; + + /**\brief Allocate memory in the cuda space */ + void * allocate( const size_t arg_alloc_size ) const ; + + /**\brief Deallocate memory in the cuda space */ + void deallocate( void * const arg_alloc_ptr + , const size_t arg_alloc_size ) const ; + + /*--------------------------------*/ + /** \brief Error reporting for HostSpace attempt to access CudaSpace */ + static void access_error(); + static void access_error( const void * const ); + +private: + + int m_device ; ///< Which Cuda device + + // friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ; +}; + +namespace Impl { +/// \brief Initialize lock array for arbitrary size atomics. +/// +/// Arbitrary atomics are implemented using a hash table of locks +/// where the hash value is derived from the address of the +/// object for which an atomic operation is performed. +/// This function initializes the locks to zero (unset). +void init_lock_array_cuda_space(); + +/// \brief Retrieve the pointer to the lock array for arbitrary size atomics. +/// +/// Arbitrary atomics are implemented using a hash table of locks +/// where the hash value is derived from the address of the +/// object for which an atomic operation is performed. +/// This function retrieves the lock array pointer. +/// If the array is not yet allocated it will do so. 
+int* lock_array_cuda_space_ptr(bool deallocate = false); +} +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Cuda memory that is accessible to Host execution space + * through Cuda's unified virtual memory (UVM) runtime. + */ +class CudaUVMSpace { +public: + + //! Tag this class as a kokkos memory space + typedef CudaUVMSpace memory_space ; + typedef Cuda execution_space ; + typedef Kokkos::Device<execution_space,memory_space> device_type; + typedef unsigned int size_type ; + + /** \brief If UVM capability is available */ + static bool available(); + + typedef Impl::CudaUVMAllocator allocator; + + /** \brief Allocate a contiguous block of memory. + * + * The input label is associated with the block of memory. + * The block of memory is tracked via reference counting where + * allocation gives it a reference count of one. + */ + static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size ); + + + /** \brief Cuda specific function to attached texture object to an allocation. + * Output the texture object, base pointer, and offset from the input pointer. 
+ */ +#if defined( __CUDACC__ ) + static void texture_object_attach( Impl::AllocationTracker const & tracker + , unsigned type_size + , ::cudaChannelFormatDesc const & desc + ); +#endif + /*--------------------------------*/ + + CudaUVMSpace(); + CudaUVMSpace( const CudaUVMSpace & rhs ) = default ; + CudaUVMSpace & operator = ( const CudaUVMSpace & rhs ) = default ; + ~CudaUVMSpace() = default ; + + /**\brief Allocate memory in the cuda space */ + void * allocate( const size_t arg_alloc_size ) const ; + + /**\brief Deallocate memory in the cuda space */ + void deallocate( void * const arg_alloc_ptr + , const size_t arg_alloc_size ) const ; + + /*--------------------------------*/ + +private: + + int m_device ; ///< Which Cuda device +}; + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Host memory that is accessible to Cuda execution space + * through Cuda's host-pinned memory allocation. + */ +class CudaHostPinnedSpace { +public: + + //! Tag this class as a kokkos memory space + /** \brief Memory is in HostSpace so use the HostSpace::execution_space */ + typedef HostSpace::execution_space execution_space ; + typedef CudaHostPinnedSpace memory_space ; + typedef Kokkos::Device<execution_space,memory_space> device_type; + typedef unsigned int size_type ; + + + typedef Impl::CudaHostAllocator allocator ; + + /** \brief Allocate a contiguous block of memory. + * + * The input label is associated with the block of memory. + * The block of memory is tracked via reference counting where + * allocation gives it a reference count of one. 
+ */ + static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size ); + + /*--------------------------------*/ + + CudaHostPinnedSpace(); + CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ; + CudaHostPinnedSpace & operator = ( const CudaHostPinnedSpace & rhs ) = default ; + ~CudaHostPinnedSpace() = default ; + + /**\brief Allocate memory in the cuda space */ + void * allocate( const size_t arg_alloc_size ) const ; + + /**\brief Deallocate memory in the cuda space */ + void deallocate( void * const arg_alloc_ptr + , const size_t arg_alloc_size ) const ; + + /*--------------------------------*/ +}; + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template<> struct DeepCopy< CudaSpace , CudaSpace > +{ + DeepCopy( void * dst , const void * src , size_t ); + DeepCopy( const Cuda & , void * dst , const void * src , size_t ); +}; + +template<> struct DeepCopy< CudaSpace , HostSpace > +{ + DeepCopy( void * dst , const void * src , size_t ); + DeepCopy( const Cuda & , void * dst , const void * src , size_t ); +}; + +template<> struct DeepCopy< HostSpace , CudaSpace > +{ + DeepCopy( void * dst , const void * src , size_t ); + DeepCopy( const Cuda & , void * dst , const void * src , size_t ); +}; + +template<> struct DeepCopy< CudaSpace , CudaUVMSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); } +}; + +template<> struct DeepCopy< CudaSpace , CudaHostPinnedSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); } +}; + + +template<> struct DeepCopy< CudaUVMSpace , CudaSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< CudaSpace , 
CudaSpace >( dst , src , n ); } +}; + +template<> struct DeepCopy< CudaUVMSpace , CudaUVMSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); } +}; + +template<> struct DeepCopy< CudaUVMSpace , CudaHostPinnedSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); } +}; + +template<> struct DeepCopy< CudaUVMSpace , HostSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); } +}; + + +template<> struct DeepCopy< CudaHostPinnedSpace , CudaSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); } +}; + +template<> struct DeepCopy< CudaHostPinnedSpace , CudaUVMSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); } +}; + +template<> struct DeepCopy< CudaHostPinnedSpace , CudaHostPinnedSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); } +}; + +template<> struct DeepCopy< CudaHostPinnedSpace , HostSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); } +}; + + +template<> struct DeepCopy< HostSpace , CudaUVMSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); } +}; + +template<> struct DeepCopy< HostSpace , CudaHostPinnedSpace > +{ + inline + DeepCopy( void * dst , const void * src , size_t n ) + { (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** Running in CudaSpace attempting to access HostSpace: error */ +template<> +struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::HostSpace > +{ + enum { value = false }; + KOKKOS_INLINE_FUNCTION static void verify( void ) + { Kokkos::abort("Cuda code attempted to access HostSpace memory"); } + + KOKKOS_INLINE_FUNCTION static void verify( const void * ) + { Kokkos::abort("Cuda code attempted to access HostSpace memory"); } +}; + +/** Running in CudaSpace accessing CudaUVMSpace: ok */ +template<> +struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaUVMSpace > +{ + enum { value = true }; + KOKKOS_INLINE_FUNCTION static void verify( void ) { } + KOKKOS_INLINE_FUNCTION static void verify( const void * ) { } +}; + +/** Running in CudaSpace accessing CudaHostPinnedSpace: ok */ +template<> +struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace > +{ + enum { value = true }; + KOKKOS_INLINE_FUNCTION static void verify( void ) { } + KOKKOS_INLINE_FUNCTION static void verify( const void * ) { } +}; + +/** Running in CudaSpace attempting to access an unknown space: error */ +template< class OtherSpace > +struct VerifyExecutionCanAccessMemorySpace< + typename enable_if< ! 
is_same<Kokkos::CudaSpace,OtherSpace>::value , Kokkos::CudaSpace >::type , + OtherSpace > +{ + enum { value = false }; + KOKKOS_INLINE_FUNCTION static void verify( void ) + { Kokkos::abort("Cuda code attempted to access unknown Space memory"); } + + KOKKOS_INLINE_FUNCTION static void verify( const void * ) + { Kokkos::abort("Cuda code attempted to access unknown Space memory"); } +}; + +//---------------------------------------------------------------------------- +/** Running in HostSpace attempting to access CudaSpace */ +template<> +struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaSpace > +{ + enum { value = false }; + inline static void verify( void ) { CudaSpace::access_error(); } + inline static void verify( const void * p ) { CudaSpace::access_error(p); } +}; + +/** Running in HostSpace accessing CudaUVMSpace is OK */ +template<> +struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaUVMSpace > +{ + enum { value = true }; + inline static void verify( void ) { } + inline static void verify( const void * ) { } +}; + +/** Running in HostSpace accessing CudaHostPinnedSpace is OK */ +template<> +struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace > +{ + enum { value = true }; + KOKKOS_INLINE_FUNCTION static void verify( void ) {} + KOKKOS_INLINE_FUNCTION static void verify( const void * ) {} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template<> +class SharedAllocationRecord< Kokkos::CudaSpace , void > + : public SharedAllocationRecord< void , void > +{ +private: + + friend class SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ; + + typedef SharedAllocationRecord< void , void > RecordBase ; + + SharedAllocationRecord( const 
SharedAllocationRecord & ) = delete ; + SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ; + + static void deallocate( RecordBase * ); + + static ::cudaTextureObject_t + attach_texture_object( const unsigned sizeof_alias + , void * const alloc_ptr + , const size_t alloc_size ); + + static RecordBase s_root_record ; + + ::cudaTextureObject_t m_tex_obj ; + const Kokkos::CudaSpace m_space ; + +protected: + + ~SharedAllocationRecord(); + SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {} + + SharedAllocationRecord( const Kokkos::CudaSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + , const RecordBase::function_type arg_dealloc = & deallocate + ); + +public: + + std::string get_label() const ; + + static SharedAllocationRecord * allocate( const Kokkos::CudaSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + ); + + template< typename AliasType > + inline + ::cudaTextureObject_t attach_texture_object() + { + static_assert( ( std::is_same< AliasType , int >::value || + std::is_same< AliasType , ::int2 >::value || + std::is_same< AliasType , ::int4 >::value ) + , "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" ); + + if ( m_tex_obj == 0 ) { + m_tex_obj = attach_texture_object( sizeof(AliasType) + , (void*) RecordBase::m_alloc_ptr + , RecordBase::m_alloc_size ); + } + + return m_tex_obj ; + } + + template< typename AliasType > + inline + int attach_texture_object_offset( const AliasType * const ptr ) + { + // Texture object is attached to the entire allocation range + return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr ); + } + + static SharedAllocationRecord * get_record( void * arg_alloc_ptr ); + + static void print_records( std::ostream & , const Kokkos::CudaSpace & , bool detail = false ); +}; + + +template<> +class SharedAllocationRecord< Kokkos::CudaUVMSpace , void > + : public SharedAllocationRecord< void , 
void > +{ +private: + + typedef SharedAllocationRecord< void , void > RecordBase ; + + SharedAllocationRecord( const SharedAllocationRecord & ) = delete ; + SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ; + + static void deallocate( RecordBase * ); + + static RecordBase s_root_record ; + + ::cudaTextureObject_t m_tex_obj ; + const Kokkos::CudaUVMSpace m_space ; + +protected: + + ~SharedAllocationRecord(); + SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {} + + SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + , const RecordBase::function_type arg_dealloc = & deallocate + ); + +public: + + std::string get_label() const ; + + static SharedAllocationRecord * allocate( const Kokkos::CudaUVMSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + ); + + template< typename AliasType > + inline + ::cudaTextureObject_t attach_texture_object() + { + static_assert( ( std::is_same< AliasType , int >::value || + std::is_same< AliasType , ::int2 >::value || + std::is_same< AliasType , ::int4 >::value ) + , "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" ); + + if ( m_tex_obj == 0 ) { + m_tex_obj = SharedAllocationRecord< Kokkos::CudaSpace , void >:: + attach_texture_object( sizeof(AliasType) + , (void*) RecordBase::m_alloc_ptr + , RecordBase::m_alloc_size ); + } + + return m_tex_obj ; + } + + template< typename AliasType > + inline + int attach_texture_object_offset( const AliasType * const ptr ) + { + // Texture object is attached to the entire allocation range + return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr ); + } + + static SharedAllocationRecord * get_record( void * arg_alloc_ptr ); + + static void print_records( std::ostream & , const Kokkos::CudaUVMSpace & , bool detail = false ); +}; + +template<> +class SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void 
> + : public SharedAllocationRecord< void , void > +{ +private: + + typedef SharedAllocationRecord< void , void > RecordBase ; + + SharedAllocationRecord( const SharedAllocationRecord & ) = delete ; + SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ; + + static void deallocate( RecordBase * ); + + static RecordBase s_root_record ; + + const Kokkos::CudaHostPinnedSpace m_space ; + +protected: + + ~SharedAllocationRecord(); + SharedAllocationRecord() : RecordBase(), m_space() {} + + SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + , const RecordBase::function_type arg_dealloc = & deallocate + ); + +public: + + std::string get_label() const ; + + static SharedAllocationRecord * allocate( const Kokkos::CudaHostPinnedSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + ); + + static SharedAllocationRecord * get_record( void * arg_alloc_ptr ); + + static void print_records( std::ostream & , const Kokkos::CudaHostPinnedSpace & , bool detail = false ); +}; + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_HAVE_CUDA ) */ +#endif /* #define KOKKOS_CUDASPACE_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp new file mode 100755 index 0000000000000000000000000000000000000000..807cb5cb435d5be51456492a7f8b0559d55d3382 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -0,0 +1,497 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXECPOLICY_HPP +#define KOKKOS_EXECPOLICY_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_StaticAssert.hpp> +#include <impl/Kokkos_Tags.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Execution policy for work over a range of an integral type. + * + * Valid template argument options: + * + * With a specified execution space: + * < ExecSpace , WorkTag , { IntConst | IntType } > + * < ExecSpace , WorkTag , void > + * < ExecSpace , { IntConst | IntType } , void > + * < ExecSpace , void , void > + * + * With the default execution space: + * < WorkTag , { IntConst | IntType } , void > + * < WorkTag , void , void > + * < { IntConst | IntType } , void , void > + * < void , void , void > + * + * IntType is a fundamental integral type + * IntConst is an Impl::integral_constant< IntType , Blocking > + * + * Blocking is the granularity of partitioning the range among threads. + */ +template< class Arg0 = void , class Arg1 = void , class Arg2 = void + , class ExecSpace = + // The first argument is the execution space, + // otherwise use the default execution space. 
+ typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0 + , Kokkos::DefaultExecutionSpace >::type + > +class RangePolicy { +private: + + // Default integral type and blocking factor: + typedef int DefaultIntType ; + enum { DefaultIntValue = 8 }; + + enum { Arg0_Void = Impl::is_same< Arg0 , void >::value }; + enum { Arg1_Void = Impl::is_same< Arg1 , void >::value }; + enum { Arg2_Void = Impl::is_same< Arg2 , void >::value }; + + enum { Arg0_ExecSpace = Impl::is_execution_space< Arg0 >::value }; + + enum { Arg0_IntConst = Impl::is_integral_constant< Arg0 >::value }; + enum { Arg1_IntConst = Impl::is_integral_constant< Arg1 >::value }; + enum { Arg2_IntConst = Impl::is_integral_constant< Arg2 >::value }; + + enum { Arg0_IntType = Impl::is_integral< Arg0 >::value }; + enum { Arg1_IntType = Impl::is_integral< Arg1 >::value }; + enum { Arg2_IntType = Impl::is_integral< Arg2 >::value }; + + enum { Arg0_WorkTag = ! Arg0_ExecSpace && ! Arg0_IntConst && ! Arg0_IntType && ! Arg0_Void }; + enum { Arg1_WorkTag = Arg0_ExecSpace && ! Arg1_IntConst && ! Arg1_IntType && ! Arg1_Void }; + + enum { ArgOption_OK = Impl::StaticAssert< ( + ( Arg0_ExecSpace && Arg1_WorkTag && ( Arg2_IntConst || Arg2_IntType ) ) || + ( Arg0_ExecSpace && Arg1_WorkTag && Arg2_Void ) || + ( Arg0_ExecSpace && ( Arg1_IntConst || Arg1_IntType ) && Arg2_Void ) || + ( Arg0_ExecSpace && Arg1_Void && Arg2_Void ) || + ( Arg0_WorkTag && ( Arg1_IntConst || Arg1_IntType ) && Arg2_Void ) || + ( Arg0_WorkTag && Arg1_Void && Arg2_Void ) || + ( ( Arg0_IntConst || Arg0_IntType ) && Arg1_Void && Arg2_Void ) || + ( Arg0_Void && Arg1_Void && Arg2_Void ) + ) >::value }; + + // The work argument tag is the first or second argument + typedef typename Impl::if_c< Arg0_WorkTag , Arg0 , + typename Impl::if_c< Arg1_WorkTag , Arg1 , void + >::type >::type + WorkTag ; + + enum { Granularity = Arg0_IntConst ? unsigned(Impl::is_integral_constant<Arg0>::integral_value) : ( + Arg1_IntConst ? 
unsigned(Impl::is_integral_constant<Arg1>::integral_value) : ( + Arg2_IntConst ? unsigned(Impl::is_integral_constant<Arg2>::integral_value) : ( + unsigned(DefaultIntValue) ))) }; + + // Only accept the integral type if the blocking is a power of two + typedef typename Impl::enable_if< Impl::is_power_of_two< Granularity >::value , + typename Impl::if_c< Arg0_IntType , Arg0 , + typename Impl::if_c< Arg1_IntType , Arg1 , + typename Impl::if_c< Arg2_IntType , Arg2 , + typename Impl::if_c< Arg0_IntConst , typename Impl::is_integral_constant<Arg0>::integral_type , + typename Impl::if_c< Arg1_IntConst , typename Impl::is_integral_constant<Arg1>::integral_type , + typename Impl::if_c< Arg2_IntConst , typename Impl::is_integral_constant<Arg2>::integral_type , + DefaultIntType + >::type >::type >::type + >::type >::type >::type + >::type + IntType ; + + enum { GranularityMask = IntType(Granularity) - 1 }; + + ExecSpace m_space ; + IntType m_begin ; + IntType m_end ; + +public: + + //! Tag this class as an execution policy + typedef ExecSpace execution_space ; + typedef RangePolicy execution_policy ; + typedef WorkTag work_tag ; + typedef IntType member_type ; + + KOKKOS_INLINE_FUNCTION const execution_space & space() const { return m_space ; } + KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; } + KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; } + + inline RangePolicy() : m_space(), m_begin(0), m_end(0) {} + + /** \brief Total range */ + inline + RangePolicy( const member_type work_begin + , const member_type work_end + ) + : m_space() + , m_begin( work_begin < work_end ? work_begin : 0 ) + , m_end( work_begin < work_end ? work_end : 0 ) + {} + + /** \brief Total range */ + inline + RangePolicy( const execution_space & work_space + , const member_type work_begin + , const member_type work_end + ) + : m_space( work_space ) + , m_begin( work_begin < work_end ? work_begin : 0 ) + , m_end( work_begin < work_end ? 
work_end : 0 ) + {} + + /** \brief Subrange for a partition's rank and size. + * + * Typically used to partition a range over a group of threads. + */ + struct WorkRange { + typedef RangePolicy::work_tag work_tag ; + typedef RangePolicy::member_type member_type ; + + KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; } + KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; } + + /** \brief Subrange for a partition's rank and size. + * + * Typically used to partition a range over a group of threads. + */ + KOKKOS_INLINE_FUNCTION + WorkRange( const RangePolicy & range + , const int part_rank + , const int part_size + ) + : m_begin(0), m_end(0) + { + if ( part_size ) { + + // Split evenly among partitions, then round up to the granularity. + const member_type work_part = + ( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size ) + + GranularityMask ) & ~member_type(GranularityMask); + + m_begin = range.begin() + work_part * part_rank ; + m_end = m_begin + work_part ; + + if ( range.end() < m_begin ) m_begin = range.end() ; + if ( range.end() < m_end ) m_end = range.end() ; + } + } + private: + member_type m_begin ; + member_type m_end ; + WorkRange(); + WorkRange & operator = ( const WorkRange & ); + }; +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Execution policy for parallel work over a league of teams of threads. + * + * The work functor is called for each thread of each team such that + * the team's member threads are guaranteed to be concurrent. + * + * The team's threads have access to team shared scratch memory and + * team collective operations. + * + * If the WorkTag is non-void then the first calling argument of the + * work functor's parentheses operator is 'const WorkTag &'. 
+ * This allows a functor to have multiple work member functions. + * + * template argument option with specified execution space: + * < ExecSpace , WorkTag > + * < ExecSpace , void > + * + * template argument option with default execution space: + * < WorkTag , void > + * < void , void > + */ +template< class Arg0 = void + , class Arg1 = void + , class ExecSpace = + // If the first argument is not an execution + // then use the default execution space. + typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0 + , Kokkos::DefaultExecutionSpace >::type + > +class TeamPolicy { +private: + + enum { Arg0_ExecSpace = Impl::is_execution_space< Arg0 >::value }; + enum { Arg1_Void = Impl::is_same< Arg1 , void >::value }; + enum { ArgOption_OK = Impl::StaticAssert< ( Arg0_ExecSpace || Arg1_Void ) >::value }; + + typedef typename Impl::if_c< Arg0_ExecSpace , Arg1 , Arg0 >::type WorkTag ; + +public: + + //! Tag this class as an execution policy + typedef TeamPolicy execution_policy ; + typedef ExecSpace execution_space ; + typedef WorkTag work_tag ; + + //---------------------------------------- + /** \brief Query maximum team size for a given functor. + * + * This size takes into account execution space concurrency limitations and + * scratch memory space limitations for reductions, team reduce/scan, and + * team shared memory. + */ + template< class FunctorType > + static int team_size_max( const FunctorType & ); + + /** \brief Query recommended team size for a given functor. + * + * This size takes into account execution space concurrency limitations and + * scratch memory space limitations for reductions, team reduce/scan, and + * team shared memory. 
+ */ + template< class FunctorType > + static int team_size_recommended( const FunctorType & ); + + template< class FunctorType > + static int team_size_recommended( const FunctorType & , const int&); + //---------------------------------------- + /** \brief Construct policy with the given instance of the execution space */ + TeamPolicy( const execution_space & , int league_size_request , int team_size_request ); + + /** \brief Construct policy with the default instance of the execution space */ + TeamPolicy( int league_size_request , int team_size_request ); + + /** \brief The actual league size (number of teams) of the policy. + * + * This may be smaller than the requested league size due to limitations + * of the execution space. + */ + KOKKOS_INLINE_FUNCTION int league_size() const ; + + /** \brief The actual team size (number of threads per team) of the policy. + * + * This may be smaller than the requested team size due to limitations + * of the execution space. + */ + KOKKOS_INLINE_FUNCTION int team_size() const ; + + /** \brief Parallel execution of a functor calls the functor once with + * each member of the execution policy. + */ + struct member_type { + + /** \brief Handle to the currently executing team shared scratch memory */ + KOKKOS_INLINE_FUNCTION + typename execution_space::scratch_memory_space team_shmem() const ; + + /** \brief Rank of this team within the league of teams */ + KOKKOS_INLINE_FUNCTION int league_rank() const ; + + /** \brief Number of teams in the league */ + KOKKOS_INLINE_FUNCTION int league_size() const ; + + /** \brief Rank of this thread within this team */ + KOKKOS_INLINE_FUNCTION int team_rank() const ; + + /** \brief Number of threads in this team */ + KOKKOS_INLINE_FUNCTION int team_size() const ; + + /** \brief Barrier among the threads of this team */ + KOKKOS_INLINE_FUNCTION void team_barrier() const ; + + /** \brief Intra-team reduction. Returns join of all values of the team members. 
*/ + template< class JoinOp > + KOKKOS_INLINE_FUNCTION + typename JoinOp::value_type team_reduce( const typename JoinOp::value_type + , const JoinOp & ) const ; + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. + * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const ; + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const ; + }; +}; + +} // namespace Kokkos + +namespace Kokkos { + +namespace Impl { + +template<typename iType, class TeamMemberType> +struct TeamThreadRangeBoundariesStruct { +private: + + KOKKOS_INLINE_FUNCTION static + iType ibegin( const iType & arg_begin + , const iType & arg_end + , const iType & arg_rank + , const iType & arg_size + ) + { + return arg_begin + ( ( arg_end - arg_begin + arg_size - 1 ) / arg_size ) * arg_rank ; + } + + KOKKOS_INLINE_FUNCTION static + iType iend( const iType & arg_begin + , const iType & arg_end + , const iType & arg_rank + , const iType & arg_size + ) + { + const iType end_ = arg_begin + ( ( arg_end - arg_begin + arg_size - 1 ) / arg_size ) * ( arg_rank + 1 ); + return end_ < arg_end ? 
end_ : arg_end ; + } + +public: + + typedef iType index_type; + const iType start; + const iType end; + enum {increment = 1}; + const TeamMemberType& thread; + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread + , const iType& arg_end + ) + : start( ibegin( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) ) + , end( iend( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) ) + , thread( arg_thread ) + {} + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread + , const iType& arg_begin + , const iType& arg_end + ) + : start( ibegin( arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) ) + , end( iend( arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) ) + , thread( arg_thread ) + {} +}; + + template<typename iType, class TeamMemberType> + struct ThreadVectorRangeBoundariesStruct { + typedef iType index_type; + enum {start = 0}; + const iType end; + enum {increment = 1}; + + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct (const TeamMemberType& thread, const iType& count): + end( count ) + {} + }; + + template<class TeamMemberType> + struct ThreadSingleStruct { + const TeamMemberType& team_member; + KOKKOS_INLINE_FUNCTION + ThreadSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){} + }; + + template<class TeamMemberType> + struct VectorSingleStruct { + const TeamMemberType& team_member; + KOKKOS_INLINE_FUNCTION + VectorSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){} + }; +} // namespace Impl + +/** \brief Execution policy for parallel work over a threads within a team. + * + * The range is split over all threads in a team. The Mapping scheme depends on the architecture. + * This policy is used together with a parallel pattern as a nested layer within a kernel launched + * with the TeamPolicy. This variant expects a single count. So the range is (0,count]. 
+ */ +template<typename iType, class TeamMemberType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& count); + +/** \brief Execution policy for parallel work over the threads within a team. + * + * The range is split over all threads in a team. The Mapping scheme depends on the architecture. + * This policy is used together with a parallel pattern as a nested layer within a kernel launched + * with the TeamPolicy. This variant expects a begin and end. So the range is (begin,end]. + */ +template<typename iType, class TeamMemberType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& begin, const iType& end); + +/** \brief Execution policy for a vector parallel loop. + * + * The range is split over all vector lanes in a thread. The Mapping scheme depends on the architecture. + * This policy is used together with a parallel pattern as a nested layer within a kernel launched + * with the TeamPolicy. This variant expects a single count. So the range is (0,count]. + */ +template<typename iType, class TeamMemberType> +KOKKOS_INLINE_FUNCTION +Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(const TeamMemberType&, const iType& count); + +} // namespace Kokkos + +#endif /* #define KOKKOS_EXECPOLICY_HPP */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp new file mode 100755 index 0000000000000000000000000000000000000000..012743d43ce31af7f1be6a91b9aafa951241b6be --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -0,0 +1,270 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HOSTSPACE_HPP +#define KOKKOS_HOSTSPACE_HPP + +#include <cstring> +#include <string> +#include <iosfwd> +#include <typeinfo> + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_MemoryTraits.hpp> + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Error.hpp> + +#include <impl/Kokkos_AllocationTracker.hpp> +#include <impl/Kokkos_BasicAllocators.hpp> + +#include <impl/KokkosExp_SharedAlloc.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +/// \brief Initialize lock array for arbitrary size atomics. +/// +/// Arbitrary atomics are implemented using a hash table of locks +/// where the hash value is derived from the address of the +/// object for which an atomic operation is performed. +/// This function initializes the locks to zero (unset). +void init_lock_array_host_space(); + +/// \brief Acquire a lock for the address +/// +/// This function tries to acquire the lock for the hash value derived +/// from the provided ptr. If the lock is successfully acquired the +/// function returns true. Otherwise it returns false. +bool lock_address_host_space(void* ptr); + +/// \brief Release lock for the address +/// +/// This function releases the lock for the hash value derived +/// from the provided ptr. This function should only be called +/// after previously successfully acquiring a lock with +/// lock_address. +void unlock_address_host_space(void* ptr); + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +/// \class HostSpace +/// \brief Memory management for host memory. +/// +/// HostSpace is a memory space that governs host memory. "Host" +/// memory means the usual CPU-accessible memory. +class HostSpace { +public: + + //! 
Tag this class as a kokkos memory space + typedef HostSpace memory_space ; + typedef size_t size_type ; + + /// \typedef execution_space + /// \brief Default execution space for this memory space. + /// + /// Every memory space has a default execution space. This is + /// useful for things like initializing a View (which happens in + /// parallel using the View's default execution space). +#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) + typedef Kokkos::OpenMP execution_space ; +#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) + typedef Kokkos::Threads execution_space ; +#elif defined( KOKKOS_HAVE_OPENMP ) + typedef Kokkos::OpenMP execution_space ; +#elif defined( KOKKOS_HAVE_PTHREAD ) + typedef Kokkos::Threads execution_space ; +#elif defined( KOKKOS_HAVE_SERIAL ) + typedef Kokkos::Serial execution_space ; +#else +# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices." +#endif + + //! This memory space preferred device_type + typedef Kokkos::Device<execution_space,memory_space> device_type; + + +#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY ) + typedef Impl::PageAlignedAllocator allocator ; +#else + typedef Impl::AlignedAllocator allocator ; +#endif + + /** \brief Allocate a contiguous block of memory. + * + * The input label is associated with the block of memory. + * The block of memory is tracked via reference counting where + * allocation gives it a reference count of one. 
+ */ + static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size ); + + /*--------------------------------*/ + /* Functions unique to the HostSpace */ + static int in_parallel(); + + static void register_in_parallel( int (*)() ); + + /*--------------------------------*/ + + /**\brief Default memory space instance */ + HostSpace(); + HostSpace( const HostSpace & rhs ) = default ; + HostSpace & operator = ( const HostSpace & ) = default ; + ~HostSpace() = default ; + + /**\brief Non-default memory space instance to choose allocation mechanism, if available */ + + enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC }; + + explicit + HostSpace( const AllocationMechanism & ); + + /**\brief Allocate memory in the host space */ + void * allocate( const size_t arg_alloc_size ) const ; + + /**\brief Deallocate memory in the host space */ + void deallocate( void * const arg_alloc_ptr + , const size_t arg_alloc_size ) const ; + +private: + + AllocationMechanism m_alloc_mech ; + + friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ; +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template<> +class SharedAllocationRecord< Kokkos::HostSpace , void > + : public SharedAllocationRecord< void , void > +{ +private: + + friend Kokkos::HostSpace ; + + typedef SharedAllocationRecord< void , void > RecordBase ; + + SharedAllocationRecord( const SharedAllocationRecord & ) = delete ; + SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ; + + static void deallocate( RecordBase * ); + + /**\brief Root record for tracked allocations from this HostSpace instance */ + static RecordBase s_root_record ; + + const Kokkos::HostSpace m_space ; + 
+protected: + + ~SharedAllocationRecord(); + SharedAllocationRecord() = default ; + + SharedAllocationRecord( const Kokkos::HostSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + , const RecordBase::function_type arg_dealloc = & deallocate + ); + +public: + + inline + std::string get_label() const + { + return std::string( RecordBase::head()->m_label ); + } + + KOKKOS_INLINE_FUNCTION static + SharedAllocationRecord * allocate( const Kokkos::HostSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + ) + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size ); +#else + return (SharedAllocationRecord *) 0 ; +#endif + } + + + static SharedAllocationRecord * get_record( void * arg_alloc_ptr ); + + static void print_records( std::ostream & , const Kokkos::HostSpace & , bool detail = false ); +}; + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class , class > struct DeepCopy ; + +template<> +struct DeepCopy<HostSpace,HostSpace> { + DeepCopy( void * dst , const void * src , size_t n ); +}; + +} // namespace Impl +} // namespace Kokkos + + +#endif /* #define KOKKOS_HOSTSPACE_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp new file mode 100755 index 0000000000000000000000000000000000000000..32822889df28cb7c928d3bf99184249d3cb2748d --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Layout.hpp @@ -0,0 +1,174 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Layout.hpp +/// \brief Declaration of various \c MemoryLayout options. 
+ +#ifndef KOKKOS_LAYOUT_HPP +#define KOKKOS_LAYOUT_HPP + +#include <stddef.h> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Tags.hpp> + +namespace Kokkos { + +//---------------------------------------------------------------------------- +/// \struct LayoutLeft +/// \brief Memory layout tag indicating left-to-right (Fortran scheme) +/// striding of multi-indices. +/// +/// This is an example of a \c MemoryLayout template parameter of +/// View. The memory layout describes how View maps from a +/// multi-index (i0, i1, ..., ik) to a memory location. +/// +/// "Layout left" indicates a mapping where the leftmost index i0 +/// refers to contiguous access, and strides increase for dimensions +/// going right from there (i1, i2, ...). This layout imitates how +/// Fortran stores multi-dimensional arrays. For the special case of +/// a two-dimensional array, "layout left" is also called "column +/// major." +struct LayoutLeft { + //! Tag this class as a kokkos array layout + typedef LayoutLeft array_layout ; +}; + +//---------------------------------------------------------------------------- +/// \struct LayoutRight +/// \brief Memory layout tag indicating right-to-left (C or +/// lexigraphical scheme) striding of multi-indices. +/// +/// This is an example of a \c MemoryLayout template parameter of +/// View. The memory layout describes how View maps from a +/// multi-index (i0, i1, ..., ik) to a memory location. +/// +/// "Right layout" indicates a mapping where the rightmost index ik +/// refers to contiguous access, and strides increase for dimensions +/// going left from there. This layout imitates how C stores +/// multi-dimensional arrays. For the special case of a +/// two-dimensional array, "layout right" is also called "row major." +struct LayoutRight { + //! 
Tag this class as a kokkos array layout + typedef LayoutRight array_layout ; +}; + +//---------------------------------------------------------------------------- +/// \struct LayoutStride +/// \brief Memory layout tag indicated arbitrarily strided +/// multi-index mapping into contiguous memory. +struct LayoutStride { + + //! Tag this class as a kokkos array layout + typedef LayoutStride array_layout ; + + enum { MAX_RANK = 8 }; + + size_t dimension[ MAX_RANK ] ; + size_t stride[ MAX_RANK ] ; + + /** \brief Compute strides from ordered dimensions. + * + * Values of order uniquely form the set [0..rank) + * and specify ordering of the dimensions. + * Order = {0,1,2,...} is LayoutLeft + * Order = {...,2,1,0} is LayoutRight + */ + template< typename iTypeOrder , typename iTypeDimen > + KOKKOS_INLINE_FUNCTION static + LayoutStride order_dimensions( int const rank + , iTypeOrder const * const order + , iTypeDimen const * const dimen ) + { + LayoutStride tmp ; + // Verify valid rank order: + int check_input = MAX_RANK < rank ? 0 : int( 1 << rank ) - 1 ; + for ( int r = 0 ; r < MAX_RANK ; ++r ) { + tmp.dimension[r] = 0 ; + tmp.stride[r] = 0 ; + check_input &= ~int( 1 << order[r] ); + } + if ( 0 == check_input ) { + size_t n = 1 ; + for ( int r = 0 ; r < rank ; ++r ) { + tmp.stride[ order[r] ] = n ; + n *= ( dimen[order[r]] ); + tmp.dimension[r] = dimen[r]; + } + } + return tmp ; + } +}; + +//---------------------------------------------------------------------------- +/// \struct LayoutTileLeft +/// \brief Memory layout tag indicating left-to-right (Fortran scheme) +/// striding of multi-indices by tiles. +/// +/// This is an example of a \c MemoryLayout template parameter of +/// View. The memory layout describes how View maps from a +/// multi-index (i0, i1, ..., ik) to a memory location. +/// +/// "Tiled layout" indicates a mapping to contiguously stored +/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two +/// dimensions. 
Indices are LayoutLeft within each tile, and the +/// tiles themselves are arranged using LayoutLeft. Note that the +/// dimensions <tt>ArgN0</tt> and <tt>ArgN1</tt> of the tiles must be +/// compile-time constants. This speeds up index calculations. If +/// both tile dimensions are powers of two, Kokkos can optimize +/// further. +template < unsigned ArgN0 , unsigned ArgN1 , + bool IsPowerOfTwo = ( Impl::is_power_of_two<ArgN0>::value && + Impl::is_power_of_two<ArgN1>::value ) + > +struct LayoutTileLeft { + //! Tag this class as a kokkos array layout + typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ; + + enum { N0 = ArgN0 }; + enum { N1 = ArgN1 }; +}; + +} // namespace Kokkos + +#endif // #ifndef KOKKOS_LAYOUT_HPP + diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp new file mode 100755 index 0000000000000000000000000000000000000000..3978a0622865d89d5f56ddb0a5f641969ed99223 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -0,0 +1,397 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MACROS_HPP +#define KOKKOS_MACROS_HPP + +//---------------------------------------------------------------------------- +/** Pick up configure/build options via #define macros: + * + * KOKKOS_HAVE_CUDA Kokkos::Cuda execution and memory spaces + * KOKKOS_HAVE_PTHREAD Kokkos::Threads execution space + * KOKKOS_HAVE_QTHREAD Kokkos::Qthread execution space + * KOKKOS_HAVE_OPENMP Kokkos::OpenMP execution space + * KOKKOS_HAVE_HWLOC HWLOC library is available + * KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK insert array bounds checks, is expensive! 
+ * KOKKOS_HAVE_CXX11 enable C++11 features + * + * KOKKOS_HAVE_MPI negotiate MPI/execution space interactions + * + * KOKKOS_USE_CUDA_UVM Use CUDA UVM for Cuda memory space + */ + +#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H +#include <KokkosCore_config.h> +#endif + +//---------------------------------------------------------------------------- +/** Pick up compiler specific #define macros: + * + * Macros for known compilers evaluate to an integral version value + * + * KOKKOS_COMPILER_NVCC + * KOKKOS_COMPILER_GNU + * KOKKOS_COMPILER_INTEL + * KOKKOS_COMPILER_IBM + * KOKKOS_COMPILER_CRAYC + * KOKKOS_COMPILER_APPLECC + * KOKKOS_COMPILER_CLANG + * KOKKOS_COMPILER_PGI + * + * Macros for which compiler extension to use for atomics on intrinsice types + * + * KOKKOS_ATOMICS_USE_CUDA + * KOKKOS_ATOMICS_USE_GNU + * KOKKOS_ATOMICS_USE_INTEL + * KOKKOS_ATOMICS_USE_OPENMP31 + * + * A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use. + * + * Macros for marking functions to run in an execution space: + * + * KOKKOS_FUNCTION + * KOKKOS_INLINE_FUNCTION request compiler to inline + * KOKKOS_FORCEINLINE_FUNCTION force compiler to inline, use with care! + */ + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) + +/* Compiling with a CUDA compiler. + * + * Include <cuda.h> to pick up the CUDA_VERSION macro defined as: + * CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 ) + * + * When generating device code the __CUDA_ARCH__ macro is defined as: + * __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 ) + */ + +#include <cuda_runtime.h> +#include <cuda.h> + +#if ! defined( CUDA_VERSION ) +#error "#include <cuda.h> did not define CUDA_VERSION" +#endif + +#if ( CUDA_VERSION < 6050 ) +// CUDA supports (inofficially) C++11 in device code starting with +// version 6.5. This includes auto type and device code internal +// lambdas. 
+#error "Cuda version 6.5 or greater required" +#endif + +#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 ) +/* Compiling with CUDA compiler for device code. */ +#error "Cuda device capability >= 3.0 is required" +#endif + +#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */ + +/*--------------------------------------------------------------------------*/ +/* Language info: C++, CUDA, OPENMP */ + +#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA ) + // Compiling Cuda code to 'ptx' + + #define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ + #define KOKKOS_INLINE_FUNCTION __device__ __host__ inline + #define KOKKOS_FUNCTION __device__ __host__ + +#endif /* #if defined( __CUDA_ARCH__ ) */ + +#if defined( _OPENMP ) + + /* Compiling with OpenMP. + * The value of _OPENMP is an integer value YYYYMM + * where YYYY and MM are the year and month designation + * of the supported OpenMP API version. + */ + +#endif /* #if defined( _OPENMP ) */ + +/*--------------------------------------------------------------------------*/ +/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */ + +#if defined( __NVCC__ ) + // NVIDIA compiler is being used. + // Code is parsed and separated into host and device code. + // Host code is compiled again with another compiler. + // Device code is compile to 'ptx'. + #define KOKKOS_COMPILER_NVCC __NVCC__ + +#else +#if defined( KOKKOS_HAVE_CXX11 ) && ! defined( KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA ) + // CUDA (including version 6.5) does not support giving lambdas as + // arguments to global functions. Thus its not currently possible + // to dispatch lambdas from the host. + #define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1 + #endif +#endif /* #if defined( __NVCC__ ) */ + +#if defined( KOKKOS_HAVE_CXX11 ) && !defined (KOKKOS_LAMBDA) + #define KOKKOS_LAMBDA [=] +#endif + +#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. 
*/ + +/* Intel compiler for host code */ + +#if defined( __INTEL_COMPILER ) + #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER +#elif defined( __ICC ) + // Old define + #define KOKKOS_COMPILER_INTEL __ICC +#elif defined( __ECC ) + // Very old define + #define KOKKOS_COMPILER_INTEL __ECC +#endif + +/* CRAY compiler for host code */ +#if defined( _CRAYC ) + #define KOKKOS_COMPILER_CRAYC _CRAYC +#endif + +#if defined( __IBMCPP__ ) + // IBM C++ + #define KOKKOS_COMPILER_IBM __IBMCPP__ +#elif defined( __IBMC__ ) + #define KOKKOS_COMPILER_IBM __IBMC__ +#endif + +#if defined( __APPLE_CC__ ) + #define KOKKOS_COMPILER_APPLECC __APPLE_CC__ +#endif + +#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL) + #define KOKKOS_COMPILER_CLANG __clang_major__*100+__clang_minor__*10+__clang_patchlevel__ +#endif + +#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ ) + #define KOKKOS_COMPILER_GNU __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__ +#endif + +#if defined( __PGIC__ ) && ! defined( __GNUC__ ) + #define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__ +#endif + +#endif /* #if ! defined( __CUDA_ARCH__ ) */ + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ +/* Intel compiler macros */ + +#if defined( KOKKOS_COMPILER_INTEL ) + + #define KOKKOS_HAVE_PRAGMA_UNROLL 1 + #define KOKKOS_HAVE_PRAGMA_IVDEP 1 + #define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1 + #define KOKKOS_HAVE_PRAGMA_VECTOR 1 + #define KOKKOS_HAVE_PRAGMA_SIMD 1 + +#if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 ) + #define KOKKOS_ENABLE_ASM 1 + #endif + + #if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! 
defined( KOKKOS_FORCEINLINE_FUNCTION ) + #if !defined (_WIN32) + #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) + #else + #define KOKKOS_FORCEINLINE_FUNCTION inline + #endif + #endif + + #if defined( __MIC__ ) + // Compiling for Xeon Phi + #endif + +#endif + +/*--------------------------------------------------------------------------*/ +/* Cray compiler macros */ + +#if defined( KOKKOS_COMPILER_CRAYC ) + + +#endif + +/*--------------------------------------------------------------------------*/ +/* IBM Compiler macros */ + +#if defined( KOKKOS_COMPILER_IBM ) + + #define KOKKOS_HAVE_PRAGMA_UNROLL 1 + //#define KOKKOS_HAVE_PRAGMA_IVDEP 1 + //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1 + //#define KOKKOS_HAVE_PRAGMA_VECTOR 1 + //#define KOKKOS_HAVE_PRAGMA_SIMD 1 + +#endif + +/*--------------------------------------------------------------------------*/ +/* CLANG compiler macros */ + +#if defined( KOKKOS_COMPILER_CLANG ) + + //#define KOKKOS_HAVE_PRAGMA_UNROLL 1 + //#define KOKKOS_HAVE_PRAGMA_IVDEP 1 + //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1 + //#define KOKKOS_HAVE_PRAGMA_VECTOR 1 + //#define KOKKOS_HAVE_PRAGMA_SIMD 1 + + #if ! defined( KOKKOS_FORCEINLINE_FUNCTION ) + #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) + #endif + +#endif + +/*--------------------------------------------------------------------------*/ +/* GNU Compiler macros */ + +#if defined( KOKKOS_COMPILER_GNU ) + + //#define KOKKOS_HAVE_PRAGMA_UNROLL 1 + //#define KOKKOS_HAVE_PRAGMA_IVDEP 1 + //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1 + //#define KOKKOS_HAVE_PRAGMA_VECTOR 1 + //#define KOKKOS_HAVE_PRAGMA_SIMD 1 + + #if ! defined( KOKKOS_FORCEINLINE_FUNCTION ) + #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) + #endif + + #if ! defined( KOKKOS_ENABLE_ASM ) && \ + ! 
( defined( __powerpc) || \ + defined(__powerpc__) || \ + defined(__powerpc64__) || \ + defined(__POWERPC__) || \ + defined(__ppc__) || \ + defined(__ppc64__) || \ + defined(__PGIC__) ) + #define KOKKOS_ENABLE_ASM 1 + #endif + +#endif + +/*--------------------------------------------------------------------------*/ + +#if defined( KOKKOS_COMPILER_PGI ) + + #define KOKKOS_HAVE_PRAGMA_UNROLL 1 + #define KOKKOS_HAVE_PRAGMA_IVDEP 1 + //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1 + #define KOKKOS_HAVE_PRAGMA_VECTOR 1 + //#define KOKKOS_HAVE_PRAGMA_SIMD 1 + +#endif + +/*--------------------------------------------------------------------------*/ + +#if defined( KOKKOS_COMPILER_NVCC ) + + #if defined(__CUDA_ARCH__ ) + #define KOKKOS_HAVE_PRAGMA_UNROLL 1 + #endif + +#endif + +//---------------------------------------------------------------------------- +/** Define function marking macros if compiler specific macros are undefined: */ + +#if ! defined( KOKKOS_FORCEINLINE_FUNCTION ) +#define KOKKOS_FORCEINLINE_FUNCTION inline +#endif + +#if ! defined( KOKKOS_INLINE_FUNCTION ) +#define KOKKOS_INLINE_FUNCTION inline +#endif + +#if ! defined( KOKKOS_FUNCTION ) +#define KOKKOS_FUNCTION /**/ +#endif + +//---------------------------------------------------------------------------- +/** Determine the default execution space for parallel dispatch. + * There is zero or one default execution space specified. + */ + +#if 1 < ( ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \ + ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \ + ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \ + ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) ) + +#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ; + +#endif + +/** If default is not specified then chose from enabled execution spaces. 
+ * Priority: CUDA, OPENMP, THREADS, SERIAL + */ +#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) +#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) +#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) +#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) +#elif defined ( KOKKOS_HAVE_CUDA ) +#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA +#elif defined ( KOKKOS_HAVE_OPENMP ) +#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP +#elif defined ( KOKKOS_HAVE_PTHREAD ) +#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS +#else +#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL +#endif + +//---------------------------------------------------------------------------- +/** Determine for what space the code is being compiled: */ + +#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_HAVE_CUDA) +#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA +#else +#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_MACROS_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp new file mode 100755 index 0000000000000000000000000000000000000000..b581c7da23fa0652521ee0d59a510c0769de7312 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp @@ -0,0 +1,116 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MEMORYTRAITS_HPP +#define KOKKOS_MEMORYTRAITS_HPP + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Tags.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Memory access traits for views, an extension point. + * + * These traits should be orthogonal. If there are dependencies then + * the MemoryTraits template must detect and enforce dependencies. 
+ * + * A zero value is the default for a View, indicating that none of + * these traits are present. + */ +enum MemoryTraitsFlags + { Unmanaged = 0x01 + , RandomAccess = 0x02 + , Atomic = 0x04 + }; + +template < unsigned T > +struct MemoryTraits { + //! Tag this class as a kokkos memory traits: + typedef MemoryTraits memory_traits ; + + enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) }; + enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) }; + enum { Atomic = T & unsigned(Kokkos::Atomic) }; + +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +typedef Kokkos::MemoryTraits<0> MemoryManaged ; +typedef Kokkos::MemoryTraits< Kokkos::Unmanaged > MemoryUnmanaged ; +typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryRandomAccess ; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Memory alignment settings + * + * Sets global value for memory alignment. Must be a power of two! + * Enable compatibility of views from different devices with static stride. + * Use compiler flag to enable overwrites. + */ +enum { MEMORY_ALIGNMENT = +#if defined( KOKKOS_MEMORY_ALIGNMENT ) + ( 1 << Kokkos::Impl::power_of_two< KOKKOS_MEMORY_ALIGNMENT >::value ) +#else + ( 1 << Kokkos::Impl::power_of_two< 128 >::value ) +#endif + , MEMORY_ALIGNMENT_THRESHOLD = 4 + }; + + +} //namespace Impl +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp new file mode 100755 index 0000000000000000000000000000000000000000..508da04c87ad7b9ea459b8ca1dde8f310587c59e --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp @@ -0,0 +1,175 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_HPP +#define KOKKOS_OPENMP_HPP + +#include <Kokkos_Core_fwd.hpp> + +#if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP ) + +#include <omp.h> + +#include <cstddef> +#include <iosfwd> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_ScratchSpace.hpp> +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Layout.hpp> +#include <impl/Kokkos_Tags.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/// \class OpenMP +/// \brief Kokkos device for multicore processors in the host memory space. +class OpenMP { +public: + //------------------------------------ + //! \name Type declarations that all Kokkos devices must provide. + //@{ + + //! Tag this class as a kokkos execution space + typedef OpenMP execution_space ; + typedef HostSpace memory_space ; + //! This execution space preferred device_type + typedef Kokkos::Device<execution_space,memory_space> device_type; + + typedef LayoutRight array_layout ; + typedef HostSpace::size_type size_type ; + + typedef ScratchMemorySpace< OpenMP > scratch_memory_space ; + + //@} + //------------------------------------ + //! \name Functions that all Kokkos devices must implement. + //@{ + + inline static bool in_parallel() { return omp_in_parallel(); } + + /** \brief Set the device in a "sleep" state. A noop for OpenMP. */ + static bool sleep(); + + /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */ + static bool wake(); + + /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ + static void fence() {} + + /// \brief Print configuration information to the given output stream. + static void print_configuration( std::ostream & , const bool detail = false ); + + /// \brief Free any resources being consumed by the device. 
+ static void finalize(); + + /** \brief Initialize the device. + * + * 1) If the hardware locality library is enabled and OpenMP has not + * already bound threads then bind OpenMP threads to maximize + * core utilization and group for memory hierarchy locality. + * + * 2) Allocate a HostThread for each OpenMP thread to hold its + * topology and fan in/out data. + */ + static void initialize( unsigned thread_count = 0 , + unsigned use_numa_count = 0 , + unsigned use_cores_per_numa = 0 ); + + static int is_initialized(); + //@} + //------------------------------------ + /** \brief This execution space has a topological thread pool which can be queried. + * + * All threads within a pool have a common memory space for which they are cache coherent. + * depth = 0 gives the number of threads in the whole pool. + * depth = 1 gives the number of threads in a NUMA region, typically sharing L3 cache. + * depth = 2 gives the number of threads at the finest granularity, typically sharing L1 cache. 
+ */ + inline static int thread_pool_size( int depth = 0 ); + + /** \brief The rank of the executing thread in this thread pool */ + KOKKOS_INLINE_FUNCTION static int thread_pool_rank(); + + //------------------------------------ + + inline static unsigned max_hardware_threads() { return thread_pool_size(0); } + + KOKKOS_INLINE_FUNCTION static + unsigned hardware_thread_id() { return thread_pool_rank(); } +}; + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template<> +struct VerifyExecutionCanAccessMemorySpace + < Kokkos::OpenMP::memory_space + , Kokkos::OpenMP::scratch_memory_space + > +{ + enum { value = true }; + inline static void verify( void ) { } + inline static void verify( const void * ) { } +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +#include <OpenMP/Kokkos_OpenMPexec.hpp> +#include <OpenMP/Kokkos_OpenMP_Parallel.hpp> + +/*--------------------------------------------------------------------------*/ + +#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP ) */ +#endif /* #ifndef KOKKOS_OPENMP_HPP */ + + diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp new file mode 100755 index 0000000000000000000000000000000000000000..52de637a56dcf4e47ed1a6791a407f7d465eff17 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Pair.hpp @@ -0,0 +1,498 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +/// \file Kokkos_Pair.hpp +/// \brief Declaration and definition of Kokkos::pair. +/// +/// This header file declares and defines Kokkos::pair and its related +/// nonmember functions. 
+ +#ifndef KOKKOS_PAIR_HPP +#define KOKKOS_PAIR_HPP + +#include <Kokkos_Macros.hpp> +#include <utility> + +namespace Kokkos { +/// \struct pair +/// \brief Replacement for std::pair that works on CUDA devices. +/// +/// The instance methods of std::pair, including its constructors, are +/// not marked as <tt>__device__</tt> functions. Thus, they cannot be +/// called on a CUDA device, such as an NVIDIA GPU. This struct +/// implements the same interface as std::pair, but can be used on a +/// CUDA device as well as on the host. +template <class T1, class T2> +struct pair +{ + //! The first template parameter of this class. + typedef T1 first_type; + //! The second template parameter of this class. + typedef T2 second_type; + + //! The first element of the pair. + first_type first; + //! The second element of the pair. + second_type second; + + /// \brief Default constructor. + /// + /// This calls the default constructors of T1 and T2. It won't + /// compile if those default constructors are not defined and + /// public. + KOKKOS_FORCEINLINE_FUNCTION + pair() + : first(), second() + {} + + /// \brief Constructor that takes both elements of the pair. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + KOKKOS_FORCEINLINE_FUNCTION + pair(first_type const& f, second_type const& s) + : first(f), second(s) + {} + + /// \brief Copy constructor. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair( const pair<U,V> &p) + : first(p.first), second(p.second) + {} + + /// \brief Assignment operator. + /// + /// This calls the assignment operators of T1 and T2. It won't + /// compile if the assignment operators are not defined and public. 
+ template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair<T1, T2> & operator=(const pair<U,V> &p) + { + first = p.first; + second = p.second; + return *this; + } + + // from std::pair<U,V> + template <class U, class V> + pair( const std::pair<U,V> &p) + : first(p.first), second(p.second) + {} + + /// \brief Return the std::pair version of this object. + /// + /// This is <i>not</i> a device function; you may not call it on a + /// CUDA device. It is meant to be called on the host, if the user + /// wants an std::pair instead of a Kokkos::pair. + /// + /// \note This is not a conversion operator, since defining a + /// conversion operator made the relational operators have + /// ambiguous definitions. + std::pair<T1,T2> to_std_pair() const + { return std::make_pair(first,second); } +}; + +template <class T1, class T2> +struct pair<T1&, T2&> +{ + //! The first template parameter of this class. + typedef T1& first_type; + //! The second template parameter of this class. + typedef T2& second_type; + + //! The first element of the pair. + first_type first; + //! The second element of the pair. + second_type second; + + /// \brief Constructor that takes both elements of the pair. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + KOKKOS_FORCEINLINE_FUNCTION + pair(first_type f, second_type s) + : first(f), second(s) + {} + + /// \brief Copy constructor. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair( const pair<U,V> &p) + : first(p.first), second(p.second) + {} + + // from std::pair<U,V> + template <class U, class V> + pair( const std::pair<U,V> &p) + : first(p.first), second(p.second) + {} + + /// \brief Assignment operator. + /// + /// This calls the assignment operators of T1 and T2. 
It won't + /// compile if the assignment operators are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair<first_type, second_type> & operator=(const pair<U,V> &p) + { + first = p.first; + second = p.second; + return *this; + } + + /// \brief Return the std::pair version of this object. + /// + /// This is <i>not</i> a device function; you may not call it on a + /// CUDA device. It is meant to be called on the host, if the user + /// wants an std::pair instead of a Kokkos::pair. + /// + /// \note This is not a conversion operator, since defining a + /// conversion operator made the relational operators have + /// ambiguous definitions. + std::pair<T1,T2> to_std_pair() const + { return std::make_pair(first,second); } +}; + +template <class T1, class T2> +struct pair<T1, T2&> +{ + //! The first template parameter of this class. + typedef T1 first_type; + //! The second template parameter of this class. + typedef T2& second_type; + + //! The first element of the pair. + first_type first; + //! The second element of the pair. + second_type second; + + /// \brief Constructor that takes both elements of the pair. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + KOKKOS_FORCEINLINE_FUNCTION + pair(first_type const& f, second_type s) + : first(f), second(s) + {} + + /// \brief Copy constructor. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair( const pair<U,V> &p) + : first(p.first), second(p.second) + {} + + // from std::pair<U,V> + template <class U, class V> + pair( const std::pair<U,V> &p) + : first(p.first), second(p.second) + {} + + /// \brief Assignment operator. + /// + /// This calls the assignment operators of T1 and T2. 
It won't + /// compile if the assignment operators are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair<first_type, second_type> & operator=(const pair<U,V> &p) + { + first = p.first; + second = p.second; + return *this; + } + + /// \brief Return the std::pair version of this object. + /// + /// This is <i>not</i> a device function; you may not call it on a + /// CUDA device. It is meant to be called on the host, if the user + /// wants an std::pair instead of a Kokkos::pair. + /// + /// \note This is not a conversion operator, since defining a + /// conversion operator made the relational operators have + /// ambiguous definitions. + std::pair<T1,T2> to_std_pair() const + { return std::make_pair(first,second); } +}; + +template <class T1, class T2> +struct pair<T1&, T2> +{ + //! The first template parameter of this class. + typedef T1& first_type; + //! The second template parameter of this class. + typedef T2 second_type; + + //! The first element of the pair. + first_type first; + //! The second element of the pair. + second_type second; + + /// \brief Constructor that takes both elements of the pair. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + KOKKOS_FORCEINLINE_FUNCTION + pair(first_type f, second_type const& s) + : first(f), second(s) + {} + + /// \brief Copy constructor. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair( const pair<U,V> &p) + : first(p.first), second(p.second) + {} + + // from std::pair<U,V> + template <class U, class V> + pair( const std::pair<U,V> &p) + : first(p.first), second(p.second) + {} + + /// \brief Assignment operator. + /// + /// This calls the assignment operators of T1 and T2. 
It won't + /// compile if the assignment operators are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair<first_type, second_type> & operator=(const pair<U,V> &p) + { + first = p.first; + second = p.second; + return *this; + } + + /// \brief Return the std::pair version of this object. + /// + /// This is <i>not</i> a device function; you may not call it on a + /// CUDA device. It is meant to be called on the host, if the user + /// wants an std::pair instead of a Kokkos::pair. + /// + /// \note This is not a conversion operator, since defining a + /// conversion operator made the relational operators have + /// ambiguous definitions. + std::pair<T1,T2> to_std_pair() const + { return std::make_pair(first,second); } +}; + +//! Equality operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return lhs.first==rhs.first && lhs.second==rhs.second; } + +//! Inequality operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return !(lhs==rhs); } + +//! Less-than operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator< (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); } + +//! Less-than-or-equal-to operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return !(rhs<lhs); } + +//! Greater-than operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator> (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return rhs<lhs; } + +//! Greater-than-or-equal-to operator for Kokkos::pair. 
+template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return !(lhs<rhs); } + +/// \brief Return a new pair. +/// +/// This is a "nonmember constructor" for Kokkos::pair. It works just +/// like std::make_pair. +template <class T1,class T2> +KOKKOS_FORCEINLINE_FUNCTION +pair<T1,T2> make_pair (T1 x, T2 y) +{ return ( pair<T1,T2>(x,y) ); } + +/// \brief Return a pair of references to the input arguments. +/// +/// This compares to std::tie (new in C++11). You can use it to +/// assign to two variables at once, from the result of a function +/// that returns a pair. For example (<tt>__device__</tt> and +/// <tt>__host__</tt> attributes omitted for brevity): +/// \code +/// // Declaration of the function to call. +/// // First return value: operation count. +/// // Second return value: whether all operations succeeded. +/// Kokkos::pair<int, bool> someFunction (); +/// +/// // Code that uses Kokkos::tie. +/// int myFunction () { +/// int count = 0; +/// bool success = false; +/// +/// // This assigns to both count and success. +/// Kokkos::tie (count, success) = someFunction (); +/// +/// if (! success) { +/// // ... Some operation failed; +/// // take corrective action ... +/// } +/// return count; +/// } +/// \endcode +/// +/// The line that uses tie() could have been written like this: +/// \code +/// Kokkos::pair<int, bool> result = someFunction (); +/// count = result.first; +/// success = result.second; +/// \endcode +/// +/// Using tie() saves two lines of code and avoids a copy of each +/// element of the pair. The latter could be significant if one or +/// both elements of the pair are more substantial objects than \c int +/// or \c bool. +template <class T1,class T2> +KOKKOS_FORCEINLINE_FUNCTION +pair<T1 &,T2 &> tie (T1 & x, T2 & y) +{ return ( pair<T1 &,T2 &>(x,y) ); } + +// +// Specialization of Kokkos::pair for a \c void second argument. 
This +// is not actually a "pair"; it only contains one element, the first. +// +template <class T1> +struct pair<T1,void> +{ + typedef T1 first_type; + typedef void second_type; + + first_type first; + enum { second = 0 }; + + KOKKOS_FORCEINLINE_FUNCTION + pair() + : first() + {} + + KOKKOS_FORCEINLINE_FUNCTION + pair(const first_type & f) + : first(f) + {} + + KOKKOS_FORCEINLINE_FUNCTION + pair(const first_type & f, int) + : first(f) + {} + + template <class U> + KOKKOS_FORCEINLINE_FUNCTION + pair( const pair<U,void> &p) + : first(p.first) + {} + + template <class U> + KOKKOS_FORCEINLINE_FUNCTION + pair<T1, void> & operator=(const pair<U,void> &p) + { + first = p.first; + return *this; + } +}; + +// +// Specialization of relational operators for Kokkos::pair<T1,void>. +// + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return lhs.first==rhs.first; } + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return !(lhs==rhs); } + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator< (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return lhs.first<rhs.first; } + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return !(rhs<lhs); } + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator> (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return rhs<lhs; } + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return !(lhs<rhs); } + +} // namespace Kokkos + + +#endif //KOKKOS_PAIR_HPP diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp new file mode 100755 index 0000000000000000000000000000000000000000..d714485e70d7726eef027e7c56c3722e65881582 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp @@ 
-0,0 +1,908 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Parallel.hpp +/// \brief Declaration of parallel operators + +#ifndef KOKKOS_PARALLEL_HPP +#define KOKKOS_PARALLEL_HPP + +#include <cstddef> +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_View.hpp> +#include <Kokkos_ExecPolicy.hpp> + +#ifdef KOKKOSP_ENABLE_PROFILING +#include <impl/Kokkos_Profiling_Interface.hpp> +#include <typeinfo> +#endif + +#include <impl/Kokkos_AllocationTracker.hpp> +#include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> + +#ifdef KOKKOS_HAVE_DEBUG +#include<iostream> +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +/** \brief Given a Functor and Execution Policy query an execution space. 
+ * + * if the Policy has an execution space use that + * else if the Functor has an execution_space use that + * else if the Functor has a device_type use that for backward compatibility + * else use the default + */ +template< class Functor + , class Policy + , class EnableFunctor = void + , class EnablePolicy = void + > +struct FunctorPolicyExecutionSpace { + typedef Kokkos::DefaultExecutionSpace execution_space ; +}; + +template< class Functor , class Policy > +struct FunctorPolicyExecutionSpace + < Functor , Policy + , typename enable_if_type< typename Functor::device_type >::type + , typename enable_if_type< typename Policy ::execution_space >::type + > +{ + typedef typename Policy ::execution_space execution_space ; +}; + +template< class Functor , class Policy > +struct FunctorPolicyExecutionSpace + < Functor , Policy + , typename enable_if_type< typename Functor::execution_space >::type + , typename enable_if_type< typename Policy ::execution_space >::type + > +{ + typedef typename Policy ::execution_space execution_space ; +}; + +template< class Functor , class Policy , class EnableFunctor > +struct FunctorPolicyExecutionSpace + < Functor , Policy + , EnableFunctor + , typename enable_if_type< typename Policy::execution_space >::type + > +{ + typedef typename Policy ::execution_space execution_space ; +}; + +template< class Functor , class Policy , class EnablePolicy > +struct FunctorPolicyExecutionSpace + < Functor , Policy + , typename enable_if_type< typename Functor::device_type >::type + , EnablePolicy + > +{ + typedef typename Functor::device_type execution_space ; +}; + +template< class Functor , class Policy , class EnablePolicy > +struct FunctorPolicyExecutionSpace + < Functor , Policy + , typename enable_if_type< typename Functor::execution_space >::type + , EnablePolicy + > +{ + typedef typename Functor::execution_space execution_space ; +}; + +//---------------------------------------------------------------------------- +/// \class 
ParallelFor +/// \brief Implementation of the ParallelFor operator that has a +/// partial specialization for the device. +/// +/// This is an implementation detail of parallel_for. Users should +/// skip this and go directly to the nonmember function parallel_for. +template< class FunctorType , class ExecPolicy > class ParallelFor ; + +/// \class ParallelReduce +/// \brief Implementation detail of parallel_reduce. +/// +/// This is an implementation detail of parallel_reduce. Users should +/// skip this and go directly to the nonmember function parallel_reduce. +template< class FunctorType , class ExecPolicy > class ParallelReduce ; + +/// \class ParallelScan +/// \brief Implementation detail of parallel_scan. +/// +/// This is an implementation detail of parallel_scan. Users should +/// skip this and go directly to the documentation of the nonmember +/// template function Kokkos::parallel_scan. +template< class FunctorType , class ExecPolicy > class ParallelScan ; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Execute \c functor in parallel according to the execution \c policy. + * + * A "functor" is a class containing the function to execute in parallel, + * data needed for that execution, and an optional \c execution_space + * typedef. Here is an example functor for parallel_for: + * + * \code + * class FunctorType { + * public: + * typedef ... execution_space ; + * void operator() ( WorkType iwork ) const ; + * }; + * \endcode + * + * In the above example, \c WorkType is any integer type for which a + * valid conversion from \c size_t to \c IntType exists. Its + * <tt>operator()</tt> method defines the operation to parallelize, + * over the range of integer indices <tt>iwork=[0,work_count-1]</tt>. + * This compares to a single iteration \c iwork of a \c for loop. 
+ * If \c execution_space is not defined DefaultExecutionSpace will be used. + */ +template< class ExecPolicy , class FunctorType > +inline +void parallel_for( const ExecPolicy & policy + , const FunctorType & functor + , const std::string& str = "" + , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0 + ) +{ +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelFor< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelFor(kpID); + } +#endif +} + +template< class FunctorType > +inline +void parallel_for( const size_t work_count + , const FunctorType & functor + , const std::string& str = "" + ) +{ + typedef typename + Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space + execution_space ; + typedef RangePolicy< execution_space > policy ; + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelFor("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelFor< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelFor(kpID); + } +#endif +} + +template< class ExecPolicy , class FunctorType > +inline +void parallel_for( const std::string & str + , const ExecPolicy & policy + , const FunctorType & functor ) +{ + #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG Start parallel_for kernel: " << str << std::endl; + #endif + + parallel_for(policy,functor,str); + + #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG End parallel_for kernel: " << str << std::endl; + #endif + (void) str; +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +/** \brief Parallel reduction + * + * Example of a parallel_reduce functor for a POD (plain old data) value type: + * \code + * class FunctorType { // For POD value type + * public: + * typedef ... execution_space ; + * typedef <podType> value_type ; + * void operator()( <intType> iwork , <podType> & update ) const ; + * void init( <podType> & update ) const ; + * void join( volatile <podType> & update , + * volatile const <podType> & input ) const ; + * + * typedef true_type has_final ; + * void final( <podType> & update ) const ; + * }; + * \endcode + * + * Example of a parallel_reduce functor for an array of POD (plain old data) values: + * \code + * class FunctorType { // For array of POD value + * public: + * typedef ... 
execution_space ; + * typedef <podType> value_type[] ; + * void operator()( <intType> , <podType> update[] ) const ; + * void init( <podType> update[] ) const ; + * void join( volatile <podType> update[] , + * volatile const <podType> input[] ) const ; + * + * typedef true_type has_final ; + * void final( <podType> update[] ) const ; + * }; + * \endcode + */ +template< class ExecPolicy , class FunctorType > +inline +void parallel_reduce( const ExecPolicy & policy + , const FunctorType & functor + , const std::string& str = "" + , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0 + ) +{ + // typedef typename + // Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space + // execution_space ; + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ; + + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + + Kokkos::View< value_type + , HostSpace + , Kokkos::MemoryUnmanaged + > + result_view ; + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelReduce("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelReduce< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , result_view ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelReduce(kpID); + } +#endif +} + +// integral range policy +template< class FunctorType > +inline +void parallel_reduce( const size_t work_count + , const FunctorType & functor + , const std::string& str = "" + ) +{ + typedef typename + Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space + execution_space ; + + typedef RangePolicy< execution_space > policy ; + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + + Kokkos::View< value_type + , HostSpace + , Kokkos::MemoryUnmanaged + > + result_view ; + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , result_view ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelReduce(kpID); + } +#endif + +} + +// general policy and view ouput +template< class ExecPolicy , class FunctorType , class ViewType > +inline +void parallel_reduce( const ExecPolicy & policy + , const FunctorType & functor + , const ViewType & result_view + , const std::string& str = "" + , typename Impl::enable_if< + ( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value +#ifdef KOKKOS_HAVE_CUDA + && ! 
Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value +#endif + )>::type * = 0 ) +{ + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelReduce(kpID); + } +#endif + +} + +// general policy and pod or array of pod output +template< class ExecPolicy , class FunctorType > +void parallel_reduce( const ExecPolicy & policy + , const FunctorType & functor +#ifdef KOKKOS_HAVE_CUDA + , typename Impl::enable_if< + ( ! Impl::is_integral< ExecPolicy >::value && + ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value ) + , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref + , const std::string& str = "" + , typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0 + ) +#else + , typename Impl::enable_if< + ( ! Impl::is_integral< ExecPolicy >::value) + , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type + >::type result_ref + , const std::string& str = "" + ) +#endif +{ + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ; + + // Wrap the result output request in a view to inform the implementation + // of the type and memory space. 
+ + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + + Kokkos::View< value_type + , HostSpace + , Kokkos::MemoryUnmanaged + > + result_view( ValueOps::pointer( result_ref ) + , ValueTraits::value_count( functor ) + ); + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelReduce(kpID); + } +#endif + +} + +// integral range policy and view ouput +template< class FunctorType , class ViewType > +inline +void parallel_reduce( const size_t work_count + , const FunctorType & functor + , const ViewType & result_view + , const std::string& str = "" + , typename Impl::enable_if<( Impl::is_view<ViewType>::value +#ifdef KOKKOS_HAVE_CUDA + && ! Impl::is_same< + typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space, + Kokkos::Cuda>::value +#endif + )>::type * = 0 ) +{ + typedef typename + Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space + execution_space ; + + typedef RangePolicy< execution_space > ExecPolicy ; + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelReduce("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , ExecPolicy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelReduce(kpID); + } +#endif + +} + +// integral range policy and pod or array of pod output +template< class FunctorType > +inline +void parallel_reduce( const size_t work_count + , const FunctorType & functor + , typename Kokkos::Impl::FunctorValueTraits< + typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value || + Impl::is_integral<FunctorType>::value, + void,FunctorType>::type + , void >::reference_type result + , const std::string& str = "" + , typename Impl::enable_if< true +#ifdef KOKKOS_HAVE_CUDA + && ! Impl::is_same< + typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space, + Kokkos::Cuda>::value +#endif + >::type * = 0 ) +{ + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ; + + typedef typename + Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space + execution_space ; + + typedef Kokkos::RangePolicy< execution_space > policy ; + + // Wrap the result output request in a view to inform the implementation + // of the type and memory space. + + typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0) + , typename ValueTraits::value_type + , typename ValueTraits::pointer_type + >::type value_type ; + + Kokkos::View< value_type + , HostSpace + , Kokkos::MemoryUnmanaged + > + result_view( ValueOps::pointer( result ) + , ValueTraits::value_count( functor ) + ); + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelReduce("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelReduce(kpID); + } +#endif + +} + +template< class ExecPolicy , class FunctorType , class ResultType > +inline +void parallel_reduce( const std::string & str + , const ExecPolicy & policy + , const FunctorType & functor + , ResultType * result) +{ + #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl; + #endif + + parallel_reduce(policy,functor,result,str); + + #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl; + #endif + (void) str; +} + +template< class ExecPolicy , class FunctorType , class ResultType > +inline +void parallel_reduce( const std::string & str + , const ExecPolicy & policy + , const FunctorType & functor + , ResultType & result) +{ + #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl; + #endif + + parallel_reduce(policy,functor,result,str); + + #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl; + #endif + (void) str; +} + +template< class ExecPolicy , class FunctorType > +inline +void parallel_reduce( const std::string & str + , const ExecPolicy & policy + , const FunctorType & functor) +{ + #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl; + #endif + + parallel_reduce(policy,functor,str); + + #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + 
Kokkos::fence(); + std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl; + #endif + (void) str; +} + + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/// \fn parallel_scan +/// \tparam ExecutionPolicy The execution policy type. +/// \tparam FunctorType The scan functor type. +/// +/// \param policy [in] The execution policy. +/// \param functor [in] The scan functor. +/// +/// This function implements a parallel scan pattern. The scan can +/// be either inclusive or exclusive, depending on how you implement +/// the scan functor. +/// +/// A scan functor looks almost exactly like a reduce functor, except +/// that its operator() takes a third \c bool argument, \c final_pass, +/// which indicates whether this is the last pass of the scan +/// operation. We will show below how to use the \c final_pass +/// argument to control whether the scan is inclusive or exclusive. +/// +/// Here is the minimum required interface of a scan functor for a POD +/// (plain old data) value type \c PodType. That is, the result is a +/// View of zero or more PodType. It is also possible for the result +/// to be an array of (same-sized) arrays of PodType, but we do not +/// show the required interface for that here. +/// \code +/// template< class ExecPolicy , class FunctorType > +/// class ScanFunctor { +/// public: +/// // The Kokkos device type +/// typedef ... execution_space; +/// // Type of an entry of the array containing the result; +/// // also the type of each of the entries combined using +/// // operator() or join(). 
+/// typedef PodType value_type; +/// +/// void operator () (const ExecPolicy::member_type & i, value_type& update, const bool final_pass) const; +/// void init (value_type& update) const; +/// void join (volatile value_type& update, volatile const value_type& input) const +/// }; +/// \endcode +/// +/// Here is an example of a functor which computes an inclusive plus-scan +/// of an array of \c int, in place. If given an array [1, 2, 3, 4], this +/// scan will overwrite that array with [1, 3, 6, 10]. +/// +/// \code +/// template<class SpaceType> +/// class InclScanFunctor { +/// public: +/// typedef SpaceType execution_space; +/// typedef int value_type; +/// typedef typename SpaceType::size_type size_type; +/// +/// InclScanFunctor( Kokkos::View<value_type*, execution_space> x +/// , Kokkos::View<value_type*, execution_space> y ) : m_x(x), m_y(y) {} +/// +/// void operator () (const size_type i, value_type& update, const bool final_pass) const { +/// update += m_x(i); +/// if (final_pass) { +/// m_y(i) = update; +/// } +/// } +/// void init (value_type& update) const { +/// update = 0; +/// } +/// void join (volatile value_type& update, volatile const value_type& input) const { +/// update += input; +/// } +/// +/// private: +/// Kokkos::View<value_type*, execution_space> m_x; +/// Kokkos::View<value_type*, execution_space> m_y; +/// }; +/// \endcode +/// +/// Here is an example of a functor which computes an <i>exclusive</i> +/// scan of an array of \c int, in place. In operator(), note both +/// that the final_pass test and the update have switched places, and +/// the use of a temporary. If given an array [1, 2, 3, 4], this scan +/// will overwrite that array with [0, 1, 3, 6]. 
+/// +/// \code +/// template<class SpaceType> +/// class ExclScanFunctor { +/// public: +/// typedef SpaceType execution_space; +/// typedef int value_type; +/// typedef typename SpaceType::size_type size_type; +/// +/// ExclScanFunctor (Kokkos::View<value_type*, execution_space> x) : x_ (x) {} +/// +/// void operator () (const size_type i, value_type& update, const bool final_pass) const { +/// const value_type x_i = x_(i); +/// if (final_pass) { +/// x_(i) = update; +/// } +/// update += x_i; +/// } +/// void init (value_type& update) const { +/// update = 0; +/// } +/// void join (volatile value_type& update, volatile const value_type& input) const { +/// update += input; +/// } +/// +/// private: +/// Kokkos::View<value_type*, execution_space> x_; +/// }; +/// \endcode +/// +/// Here is an example of a functor which builds on the above +/// exclusive scan example, to compute an offsets array from a +/// population count array, in place. We assume that the pop count +/// array has an extra entry at the end to store the final count. If +/// given an array [1, 2, 3, 4, 0], this scan will overwrite that +/// array with [0, 1, 3, 6, 10]. +/// +/// \code +/// template<class SpaceType> +/// class OffsetScanFunctor { +/// public: +/// typedef SpaceType execution_space; +/// typedef int value_type; +/// typedef typename SpaceType::size_type size_type; +/// +/// // lastIndex_ is the last valid index (zero-based) of x. +/// // If x has length zero, then lastIndex_ won't be used anyway. +/// OffsetScanFunctor( Kokkos::View<value_type*, execution_space> x +/// , Kokkos::View<value_type*, execution_space> y ) +/// : m_x(x), m_y(y), last_index_ (x.dimension_0 () == 0 ? 0 : x.dimension_0 () - 1) +/// {} +/// +/// void operator () (const size_type i, int& update, const bool final_pass) const { +/// if (final_pass) { +/// m_y(i) = update; +/// } +/// update += m_x(i); +/// // The last entry of m_y gets the final sum. 
+/// if (final_pass && i == last_index_) { +/// m_y(i+1) = update; +/// } +/// } +/// void init (value_type& update) const { +/// update = 0; +/// } +/// void join (volatile value_type& update, volatile const value_type& input) const { +/// update += input; +/// } +/// +/// private: +/// Kokkos::View<value_type*, execution_space> m_x; +/// Kokkos::View<value_type*, execution_space> m_y; +/// const size_type last_index_; +/// }; +/// \endcode +/// +template< class ExecutionPolicy , class FunctorType > +inline +void parallel_scan( const ExecutionPolicy & policy + , const FunctorType & functor + , const std::string& str = "" + , typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0 + ) +{ +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + Impl::ParallelScan< FunctorType , ExecutionPolicy > scan( Impl::CopyWithoutTracking::apply(functor) , policy ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelScan(kpID); + } +#endif + +} + +template< class FunctorType > +inline +void parallel_scan( const size_t work_count + , const FunctorType & functor + , const std::string& str = "" ) +{ + typedef typename + Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space + execution_space ; + + typedef Kokkos::RangePolicy< execution_space > policy ; + +#ifdef KOKKOSP_ENABLE_PROFILING + uint64_t kpID = 0; + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::beginParallelScan("" == str ? 
typeid(FunctorType).name() : str, 0, &kpID); + } +#endif + + (void) Impl::ParallelScan< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) ); + +#ifdef KOKKOSP_ENABLE_PROFILING + if(Kokkos::Experimental::profileLibraryLoaded()) { + Kokkos::Experimental::endParallelScan(kpID); + } +#endif + +} + +template< class ExecutionPolicy , class FunctorType > +inline +void parallel_scan( const std::string& str + , const ExecutionPolicy & policy + , const FunctorType & functor) +{ + #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl; + #endif + + parallel_scan(policy,functor,str); + + #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl; + #endif + (void) str; +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Enable = void > +struct FunctorTeamShmemSize +{ + static inline size_t value( const FunctorType & , int ) { return 0 ; } +}; + +template< class FunctorType > +struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type > +{ + static inline size_t value( const FunctorType & f , int team_size ) { return f.team_shmem_size( team_size ) ; } +}; + +template< class FunctorType > +struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::shmem_size ) >::type > +{ + static inline size_t value( const FunctorType & f , int team_size ) { return f.shmem_size( team_size ) ; } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------- + +#endif /* KOKKOS_PARALLEL_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_Qthread.hpp b/lib/kokkos/core/src/Kokkos_Qthread.hpp new file mode 100755 index 0000000000000000000000000000000000000000..4f12c02ba0096b57a34ffef6a945d567db33e83c --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Qthread.hpp @@ -0,0 +1,165 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_QTHREAD_HPP +#define KOKKOS_QTHREAD_HPP + +#include <cstddef> +#include <iosfwd> +#include <Kokkos_Core.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_ExecPolicy.hpp> +#include <impl/Kokkos_Tags.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +class QthreadExec ; +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Execution space supported by Qthread */ +class Qthread { +public: + //! \name Type declarations that all Kokkos devices must provide. + //@{ + + //! Tag this class as an execution space + typedef Qthread execution_space ; + typedef Kokkos::HostSpace memory_space ; + //! 
This execution space preferred device_type + typedef Kokkos::Device<execution_space,memory_space> device_type; + + typedef Kokkos::LayoutRight array_layout ; + typedef memory_space::size_type size_type ; + + typedef ScratchMemorySpace< Qthread > scratch_memory_space ; + + //@} + /*------------------------------------------------------------------------*/ + + /** \brief Initialization will construct one or more instances */ + static Qthread & instance( int = 0 ); + + /** \brief Set the execution space to a "sleep" state. + * + * This function sets the "sleep" state in which it is not ready for work. + * This may consume less resources than in an "ready" state, + * but it may also take time to transition to the "ready" state. + * + * \return True if enters or is in the "sleep" state. + * False if functions are currently executing. + */ + bool sleep(); + + /** \brief Wake from the sleep state. + * + * \return True if enters or is in the "ready" state. + * False if functions are currently executing. + */ + static bool wake(); + + /** \brief Wait until all dispatched functions to complete. + * + * The parallel_for or parallel_reduce dispatch of a functor may + * return asynchronously, before the functor completes. This + * method does not return until all dispatched functors on this + * device have completed. + */ + static void fence(); + + /*------------------------------------------------------------------------*/ + + static void initialize( int thread_count ); + static void finalize(); + + /** \brief Print configuration information to the given output stream. 
*/ + static void print_configuration( std::ostream & , const bool detail = false ); + + int shepherd_size() const ; + int shepherd_worker_size() const ; +}; + +/*--------------------------------------------------------------------------*/ + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template<> +struct VerifyExecutionCanAccessMemorySpace + < Kokkos::Qthread::memory_space + , Kokkos::Qthread::scratch_memory_space + > +{ + enum { value = true }; + inline static void verify( void ) { } + inline static void verify( const void * ) { } +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +#include <Kokkos_Parallel.hpp> +#include <Qthread/Kokkos_QthreadExec.hpp> +#include <Qthread/Kokkos_Qthread_Parallel.hpp> + +#endif /* #define KOKKOS_QTHREAD_HPP */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp new file mode 100755 index 0000000000000000000000000000000000000000..6e5b4f96242b0f9af803a71643182528017271ae --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp @@ -0,0 +1,125 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SCRATCHSPACE_HPP +#define KOKKOS_SCRATCHSPACE_HPP + +#include <stdio.h> +#include <Kokkos_Core_fwd.hpp> +#include <impl/Kokkos_Tags.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Scratch memory space associated with an execution space. + * + */ +template< class ExecSpace > +class ScratchMemorySpace { +public: + + // Alignment of memory chunks returned by 'get' + // must be a power of two + enum { ALIGN = 8 }; + +private: + + mutable char * m_iter ; + char * m_end ; + + ScratchMemorySpace(); + ScratchMemorySpace & operator = ( const ScratchMemorySpace & ); + + enum { MASK = ALIGN - 1 }; // Alignment used by View::shmem_size + +public: + + //! Tag this class as a memory space + typedef ScratchMemorySpace memory_space ; + typedef ExecSpace execution_space ; + //! This execution space preferred device_type + typedef Kokkos::Device<execution_space,memory_space> device_type; + + typedef typename ExecSpace::array_layout array_layout ; + typedef typename ExecSpace::size_type size_type ; + + template< typename IntType > + KOKKOS_INLINE_FUNCTION static + IntType align( const IntType & size ) + { return ( size + MASK ) & ~MASK ; } + + template< typename IntType > + KOKKOS_INLINE_FUNCTION + void* get_shmem (const IntType& size) const { + void* tmp = m_iter ; + if (m_end < (m_iter += align (size))) { + m_iter -= align (size); // put it back like it was + #ifdef KOKKOS_HAVE_DEBUG + // mfh 23 Jun 2015: printf call consumes 25 registers + // in a CUDA build, so only print in debug mode. The + // function still returns NULL if not enough memory. 
+ printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate " + "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size), + long(m_end-m_iter)); + #endif // KOKKOS_HAVE_DEBUG + tmp = 0; + } + return tmp; + } + + template< typename IntType > + KOKKOS_INLINE_FUNCTION + ScratchMemorySpace( void * ptr , const IntType & size ) + : m_iter( (char *) ptr ) + , m_end( m_iter + size ) + {} +}; + +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp new file mode 100755 index 0000000000000000000000000000000000000000..5773a18b3f4c9288070be0f2a6e398d714b68ee3 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Serial.hpp @@ -0,0 +1,892 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Serial.hpp +/// \brief Declaration and definition of Kokkos::Serial device. + +#ifndef KOKKOS_SERIAL_HPP +#define KOKKOS_SERIAL_HPP + +#include <cstddef> +#include <iosfwd> +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_ScratchSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> + +#if defined( KOKKOS_HAVE_SERIAL ) + +namespace Kokkos { + +/// \class Serial +/// \brief Kokkos device for non-parallel execution +/// +/// A "device" represents a parallel execution model. It tells Kokkos +/// how to parallelize the execution of kernels in a parallel_for or +/// parallel_reduce. For example, the Threads device uses Pthreads or +/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language +/// extensions, and the Cuda device uses NVIDIA's CUDA programming +/// model. The Serial device executes "parallel" kernels +/// sequentially. 
This is useful if you really do not want to use +/// threads, or if you want to explore different combinations of MPI +/// and shared-memory parallel programming models. +class Serial { +public: + //! \name Type declarations that all Kokkos devices must provide. + //@{ + + //! Tag this class as an execution space: + typedef Serial execution_space ; + //! The size_type typedef best suited for this device. + typedef HostSpace::size_type size_type ; + //! This device's preferred memory space. + typedef HostSpace memory_space ; + //! This execution space preferred device_type + typedef Kokkos::Device<execution_space,memory_space> device_type; + + //! This device's preferred array layout. + typedef LayoutRight array_layout ; + + /// \brief Scratch memory space + typedef ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ; + + //@} + + /// \brief True if and only if this method is being called in a + /// thread-parallel function. + /// + /// For the Serial device, this method <i>always</i> returns false, + /// because parallel_for or parallel_reduce with the Serial device + /// always execute sequentially. + inline static int in_parallel() { return false ; } + + /** \brief Set the device in a "sleep" state. + * + * This function sets the device in a "sleep" state in which it is + * not ready for work. This may consume less resources than if the + * device were in an "awake" state, but it may also take time to + * bring the device from a sleep state to be ready for work. + * + * \return True if the device is in the "sleep" state, else false if + * the device is actively working and could not enter the "sleep" + * state. + */ + static bool sleep(); + + /// \brief Wake the device from the 'sleep' state so it is ready for work. + /// + /// \return True if the device is in the "ready" state, else "false" + /// if the device is actively working (which also means that it's + /// awake). + static bool wake(); + + /// \brief Wait until all dispatched functors complete. 
+ /// + /// The parallel_for or parallel_reduce dispatch of a functor may + /// return asynchronously, before the functor completes. This + /// method does not return until all dispatched functors on this + /// device have completed. + static void fence() {} + + static void initialize( unsigned threads_count = 1 , + unsigned use_numa_count = 0 , + unsigned use_cores_per_numa = 0 , + bool allow_asynchronous_threadpool = false) { + (void) threads_count; + (void) use_numa_count; + (void) use_cores_per_numa; + (void) allow_asynchronous_threadpool; + + // Init the array of locks used for arbitrarily sized atomics + Impl::init_lock_array_host_space(); + + } + + static int is_initialized() { return 1 ; } + + //! Free any resources being consumed by the device. + static void finalize() {} + + //! Print configuration information to the given output stream. + static void print_configuration( std::ostream & , const bool detail = false ) {} + + //-------------------------------------------------------------------------- + + inline static int thread_pool_size( int = 0 ) { return 1 ; } + KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; } + + //-------------------------------------------------------------------------- + + KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); } + inline static unsigned max_hardware_threads() { return thread_pool_size(0); } + + //-------------------------------------------------------------------------- + + static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size ); + + //-------------------------------------------------------------------------- +}; + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template<> +struct VerifyExecutionCanAccessMemorySpace + < Kokkos::Serial::memory_space + , 
Kokkos::Serial::scratch_memory_space + > +{ + enum { value = true }; + inline static void verify( void ) { } + inline static void verify( const void * ) { } +}; + +namespace SerialImpl { + +struct Sentinel { + + void * m_scratch ; + unsigned m_reduce_end ; + unsigned m_shared_end ; + + Sentinel(); + ~Sentinel(); + static Sentinel & singleton(); +}; + +inline +unsigned align( unsigned n ); +} +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +class SerialTeamMember { +private: + typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ; + const scratch_memory_space m_space ; + const int m_league_rank ; + const int m_league_size ; + + SerialTeamMember & operator = ( const SerialTeamMember & ); + +public: + + KOKKOS_INLINE_FUNCTION + const scratch_memory_space & team_shmem() const { return m_space ; } + + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; } + KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; } + KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; } + KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; } + + KOKKOS_INLINE_FUNCTION void team_barrier() const {} + + template<class ValueType> + KOKKOS_INLINE_FUNCTION + void team_broadcast(const ValueType& , const int& ) const {} + + template< class ValueType, class JoinOp > + KOKKOS_INLINE_FUNCTION + ValueType team_reduce( const ValueType & value , const JoinOp & ) const + { + return value ; + } + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. 
+ * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const + { + const Type tmp = global_accum ? *global_accum : Type(0) ; + if ( global_accum ) { *global_accum += value ; } + return tmp ; + } + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. + * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const + { return Type(0); } + + //---------------------------------------- + // Execution space specific: + + SerialTeamMember( int arg_league_rank + , int arg_league_size + , int arg_shared_size + ); +}; + +} // namespace Impl + + +/* + * < Kokkos::Serial , WorkArgTag > + * < WorkArgTag , Impl::enable_if< Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type > + * + */ +template< class Arg0 , class Arg1 > +class TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > +{ +private: + + const int m_league_size ; + +public: + + //! Tag this class as a kokkos execution policy + typedef TeamPolicy execution_policy ; + + //! Execution space of this execution policy: + typedef Kokkos::Serial execution_space ; + + typedef typename + Impl::if_c< ! 
Impl::is_same< Kokkos::Serial , Arg0 >::value , Arg0 , Arg1 >::type + work_tag ; + + //---------------------------------------- + + template< class FunctorType > + static + int team_size_max( const FunctorType & ) { return 1 ; } + + template< class FunctorType > + static + int team_size_recommended( const FunctorType & ) { return 1 ; } + + template< class FunctorType > + static + int team_size_recommended( const FunctorType & , const int& ) { return 1 ; } + + //---------------------------------------- + + inline int team_size() const { return 1 ; } + inline int league_size() const { return m_league_size ; } + + /** \brief Specify league size, request team size */ + TeamPolicy( execution_space & , int league_size_request , int /* team_size_request */ , int vector_length_request = 1 ) + : m_league_size( league_size_request ) + { (void) vector_length_request; } + + TeamPolicy( int league_size_request , int /* team_size_request */ , int vector_length_request = 1 ) + : m_league_size( league_size_request ) + { (void) vector_length_request; } + + typedef Impl::SerialTeamMember member_type ; +}; + +} /* namespace Kokkos */ + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ; + +public: + // work tag is void + template< class PType > + inline + ParallelFor( typename Impl::enable_if< + ( Impl::is_same< PType , Policy >::value && + Impl::is_same< typename PType::work_tag , void >::value + ), const FunctorType & 
>::type functor + , const PType & policy ) + { + const typename PType::member_type e = policy.end(); + for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) { + functor( i ); + } + } + + // work tag is non-void + template< class PType > + inline + ParallelFor( typename Impl::enable_if< + ( Impl::is_same< PType , Policy >::value && + ! Impl::is_same< typename PType::work_tag , void >::value + ), const FunctorType & >::type functor + , const PType & policy ) + { + const typename PType::member_type e = policy.end(); + for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) { + functor( typename PType::work_tag() , i ); + } + } +}; + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > > +{ +public: + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ; + typedef typename Policy::work_tag WorkTag ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + // Work tag is void + template< class ViewType , class PType > + ParallelReduce( typename Impl::enable_if< + ( Impl::is_view< ViewType >::value && + Impl::is_same< typename ViewType::memory_space , HostSpace >::value && + Impl::is_same< PType , Policy >::value && + Impl::is_same< typename PType::work_tag , void >::value + ), const FunctorType & >::type functor + , const PType & policy + , const ViewType & result + ) + { + pointer_type result_ptr = result.ptr_on_device(); + + if ( ! 
result_ptr ) { + result_ptr = (pointer_type) + Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 ); + } + + reference_type update = ValueInit::init( functor , result_ptr ); + + const typename PType::member_type e = policy.end(); + for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) { + functor( i , update ); + } + + Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr ); + } + + // Work tag is non-void + template< class ViewType , class PType > + ParallelReduce( typename Impl::enable_if< + ( Impl::is_view< ViewType >::value && + Impl::is_same< typename ViewType::memory_space , HostSpace >::value && + Impl::is_same< PType , Policy >::value && + ! Impl::is_same< typename PType::work_tag , void >::value + ), const FunctorType & >::type functor + , const PType & policy + , const ViewType & result + ) + { + pointer_type result_ptr = result.ptr_on_device(); + + if ( ! result_ptr ) { + result_ptr = (pointer_type) + Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 ); + } + + typename ValueTraits::reference_type update = ValueInit::init( functor , result_ptr ); + + const typename PType::member_type e = policy.end(); + for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) { + functor( typename PType::work_tag() , i , update ); + } + + Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr ); + } +}; + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ; + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ; + +public: + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef 
typename ValueTraits::reference_type reference_type ; + + // work tag is void + template< class PType > + inline + ParallelScan( typename Impl::enable_if< + ( Impl::is_same< PType , Policy >::value && + Impl::is_same< typename PType::work_tag , void >::value + ), const FunctorType & >::type functor + , const PType & policy ) + { + pointer_type result_ptr = (pointer_type) + Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 ); + + reference_type update = ValueInit::init( functor , result_ptr ); + + const typename PType::member_type e = policy.end(); + for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) { + functor( i , update , true ); + } + + Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr ); + } + + // work tag is non-void + template< class PType > + inline + ParallelScan( typename Impl::enable_if< + ( Impl::is_same< PType , Policy >::value && + ! Impl::is_same< typename PType::work_tag , void >::value + ), const FunctorType & >::type functor + , const PType & policy ) + { + pointer_type result_ptr = (pointer_type) + Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 ); + + reference_type update = ValueInit::init( functor , result_ptr ); + + const typename PType::member_type e = policy.end(); + for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) { + functor( typename PType::work_tag() , i , update , true ); + } + + Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr ); + } +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Arg0 , class Arg1 > +class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > > +{ +private: + + 
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ; + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value , + const FunctorType & >::type functor + , const typename Policy::member_type & member ) + { functor( member ); } + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value , + const FunctorType & >::type functor + , const typename Policy::member_type & member ) + { functor( TagType() , member ); } + +public: + + ParallelFor( const FunctorType & functor + , const Policy & policy ) + { + const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ); + + Kokkos::Serial::scratch_memory_resize( 0 , shared_size ); + + for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) { + ParallelFor::template driver< typename Policy::work_tag > + ( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) ); + // functor( typename Policy::member_type(ileague,policy.league_size(),shared_size) ); + } + } +}; + +template< class FunctorType , class Arg0 , class Arg1 > +class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > > +{ +private: + + typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ; + +public: + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + +private: + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value , + const FunctorType & >::type functor + , const typename Policy::member_type & member + , 
reference_type update ) + { functor( member , update ); } + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value , + const FunctorType & >::type functor + , const typename Policy::member_type & member + , reference_type update ) + { functor( TagType() , member , update ); } + +public: + + template< class ViewType > + ParallelReduce( const FunctorType & functor + , const Policy & policy + , const ViewType & result + ) + { + const int reduce_size = ValueTraits::value_size( functor ); + const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ); + void * const scratch_reduce = Kokkos::Serial::scratch_memory_resize( reduce_size , shared_size ); + + const pointer_type result_ptr = + result.ptr_on_device() ? result.ptr_on_device() + : (pointer_type) scratch_reduce ; + + reference_type update = ValueInit::init( functor , result_ptr ); + + for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) { + ParallelReduce::template driver< typename Policy::work_tag > + ( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) , update ); + } + + Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr ); + } +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +namespace Impl { + +template<typename iType> +struct TeamThreadRangeBoundariesStruct<iType,SerialTeamMember> { + typedef iType index_type; + const iType begin ; + const iType end ; + enum {increment = 1}; + const SerialTeamMember& thread; + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_count) + : begin(0) + , end(arg_count) + , thread(arg_thread) + {} + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_begin, const iType & arg_end ) + : begin( arg_begin ) + , end( 
arg_end) + , thread( arg_thread ) + {} +}; + + template<typename iType> + struct ThreadVectorRangeBoundariesStruct<iType,SerialTeamMember> { + typedef iType index_type; + enum {start = 0}; + const iType end; + enum {increment = 1}; + + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct (const SerialTeamMember& thread, const iType& count): + end( count ) + {} + }; + +} // namespace Impl + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember> +TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & count ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>(thread,count); +} + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember> +TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & begin , const iType & end ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>(thread,begin,end); +} + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember > + ThreadVectorRange(const Impl::SerialTeamMember& thread, const iType& count) { + return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >(thread,count); +} + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct<Impl::SerialTeamMember> PerTeam(const Impl::SerialTeamMember& thread) { + return Impl::ThreadSingleStruct<Impl::SerialTeamMember>(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct<Impl::SerialTeamMember> PerThread(const Impl::SerialTeamMember& thread) { + return Impl::VectorSingleStruct<Impl::SerialTeamMember>(thread); +} + +} // namespace Kokkos + +namespace Kokkos { + + /** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team. 
 * This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries, const Lambda& lambda) {
  // Serial back-end: the whole range is executed sequentially by the
  // calling thread (increment is the compile-time constant 1).
  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment)
    lambda(i);
}

/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
 * val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
                     const Lambda & lambda, ValueType& result) {

  // ValueType() must be the identity of operator+= for the sum to be correct.
  result = ValueType();

  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    ValueType tmp = ValueType();
    lambda(i,tmp);
    result+=tmp;
  }

  // Combine across the team; mirrors the multi-thread back-ends even though
  // the Serial team holds only the calling thread.
  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}

#ifdef KOKKOS_HAVE_CXX11

/** \brief Inter-thread parallel_reduce with a user join. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
 * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
 * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
 * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
 * '1 for *'). 
This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {

  // init_result serves both as the starting value of the reduction and as
  // the destination for the final reduced value.
  ValueType result = init_result;

  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    // NOTE(review): tmp is value-initialized rather than copied from
    // init_result as the doc comment above implies — confirm intended for
    // joins whose identity is not ValueType() (e.g. multiplication).
    ValueType tmp = ValueType();
    lambda(i,tmp);
    join(result,tmp);
  }

  // Combine across the (single-thread) Serial team; JoinLambdaAdapter wraps
  // the user-supplied join so team_reduce can invoke it.
  init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
}

#endif // KOKKOS_HAVE_CXX11

} //namespace Kokkos

namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
 * This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
    loop_boundaries, const Lambda& lambda) {
  // Tell the compiler the iterations are independent so it may vectorize.
  #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
  #pragma ivdep
  #endif
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
    lambda(i);
}

/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
 * val is performed and put into result. 
This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
      loop_boundaries, const Lambda & lambda, ValueType& result) {
  // Sum-reduction over the vector range.  No cross-lane combine follows the
  // loop: the Serial back-end runs every "lane" on the calling thread.
  result = ValueType();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    ValueType tmp = ValueType();
    lambda(i,tmp);
    result+=tmp;
  }
}

/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
 * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
 * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
 * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
 * '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {

  // Accumulate onto the caller-provided initial (identity) value.
  ValueType result = init_result;
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    // NOTE(review): tmp is value-initialized rather than copied from
    // init_result as the doc comment above implies — confirm intended for
    // joins whose identity is not ValueType().
    ValueType tmp = ValueType();
    lambda(i,tmp);
    join(result,tmp);
  }
  init_result = result;
}

/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
 * for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
 * Depending on the target execution space the operator might be called twice: once with final=false
 * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
 * "i" needs to be added to val no matter whether final==true or not. In a serial execution
 * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
 * to the final sum value over all vector lanes.
 * This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
      loop_boundaries, const FunctorType & lambda) {

  // Deduce the scan's value type from the functor's reduction traits.
  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
  typedef typename ValueTraits::value_type value_type ;

  value_type scan_val = value_type();

  // Serial execution: a single pass with final==true both hands each
  // iteration its exclusive prefix value and accumulates the running sum.
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,scan_val,true);
  }
}

} // namespace Kokkos

namespace Kokkos {

// single(PerThread,...) / single(PerTeam,...) overloads for the Serial
// back-end: with one thread and one vector lane the lambda simply runs once.

template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
  lambda();
}

template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
  lambda();
}

// Overloads taking 'val': the lambda produces a value (presumably broadcast
// to peers on parallel back-ends — here it is just computed in place).
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
  lambda(val);
}

template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
  lambda(val);
}
}

#endif // defined( KOKKOS_HAVE_SERIAL )
#endif /* #define KOKKOS_SERIAL_HPP */
+//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + diff --git a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp new file mode 100755 index 0000000000000000000000000000000000000000..6f6453fd46f1e90cc8ee5f6edd119f45843078d4 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp @@ -0,0 +1,376 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_TASKPOLICY_HPP +#define KOKKOS_TASKPOLICY_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_StaticAssert.hpp> +#include <impl/Kokkos_AllocationTracker.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +struct FutureValueTypeIsVoidError {}; + +template < class ExecSpace , class ResultType , class FunctorType > +class TaskMember ; + +template< class ExecPolicy , class ResultType , class FunctorType > +class TaskForEach ; + +template< class ExecPolicy , class ResultType , class FunctorType > +class TaskReduce ; + +template< class ExecPolicy , class ResultType , class FunctorType > +struct TaskScan ; + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +/**\brief States of a task */ +enum TaskState + { TASK_STATE_NULL = 0 ///< Does not exist + , TASK_STATE_CONSTRUCTING = 1 ///< Is under construction + , TASK_STATE_WAITING = 2 ///< Is waiting for execution + , 
TASK_STATE_EXECUTING = 4 ///< Is executing
  , TASK_STATE_COMPLETE = 8 ///< Execution is complete
  };

/** \brief Reference-counted handle to a task and (once complete) its result.
 *
 *  Template-argument resolution:
 *
 *  Future< space >          // value_type == void
 *  Future< value >          // space == Default
 *  Future< value , space >
 *
 */
template< class Arg1 = void , class Arg2 = void >
class Future {
private:

  template< class , class , class > friend class Impl::TaskMember ;
  template< class > friend class TaskPolicy ;
  template< class , class > friend class Future ;

  // Argument #2, if not void, must be the space.
  enum { Arg1_is_space = Kokkos::Impl::is_execution_space< Arg1 >::value };
  enum { Arg2_is_space = Kokkos::Impl::is_execution_space< Arg2 >::value };
  enum { Arg2_is_void  = Kokkos::Impl::is_same< Arg2 , void >::value };

  struct ErrorNoExecutionSpace {};

  // Exactly one of the three legal argument patterns must hold; otherwise
  // StaticAssert fails at compile time with ErrorNoExecutionSpace.
  enum { Opt1  =   Arg1_is_space && Arg2_is_void      // Future< space >
       , Opt2  = ! Arg1_is_space && Arg2_is_void      // Future< value >
       , Opt3  = ! Arg1_is_space && Arg2_is_space     // Future< value , space >
       , OptOK = Kokkos::Impl::StaticAssert< Opt1 || Opt2 || Opt3 , ErrorNoExecutionSpace >::value
       };

  // Result type carried by the future; void when only a space was given.
  typedef typename
    Kokkos::Impl::if_c< Opt2 || Opt3 , Arg1 , void >::type
      ValueType ;

  // Execution space: Arg1 (Opt1), the default space (Opt2), or Arg2 (Opt3).
  typedef typename
    Kokkos::Impl::if_c< Opt1 , Arg1 , typename
    Kokkos::Impl::if_c< Opt2 , Kokkos::DefaultExecutionSpace , typename
    Kokkos::Impl::if_c< Opt3 , Arg2 , void
    >::type >::type >::type
      ExecutionSpace ;

  typedef Impl::TaskMember< ExecutionSpace , void , void >      TaskRoot ;
  typedef Impl::TaskMember< ExecutionSpace , ValueType , void > TaskValue ;

  // Reference-counted pointer to the underlying task; 0 == null future.
  // All ownership transfers go through TaskRoot::assign.
  TaskRoot * m_task ;

public:

  typedef ValueType      value_type;
  typedef ExecutionSpace execution_space ;

  //----------------------------------------

  // State of the referenced task; a null future reports TASK_STATE_NULL.
  KOKKOS_INLINE_FUNCTION
  TaskState get_task_state() const
    { return 0 != m_task ? m_task->get_state() : TASK_STATE_NULL ; }

  //----------------------------------------

  // Adopt a raw task pointer; verify_type checks the task's result type
  // matches value_type before the reference is taken.
  explicit
  Future( TaskRoot * task )
    : m_task(0)
    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( task ) ); }

  //----------------------------------------

  // Release the reference (assigning 0 decrements the count).
  KOKKOS_INLINE_FUNCTION
  ~Future() { TaskRoot::assign( & m_task , 0 ); }

  //----------------------------------------

  KOKKOS_INLINE_FUNCTION
  Future() : m_task(0) {}

  KOKKOS_INLINE_FUNCTION
  Future( const Future & rhs )
    : m_task(0)
    { TaskRoot::assign( & m_task , rhs.m_task ); }

  KOKKOS_INLINE_FUNCTION
  Future & operator = ( const Future & rhs )
    { TaskRoot::assign( & m_task , rhs.m_task ); return *this ; }

  //----------------------------------------

  // Converting copy/assign from a differently-typed future; verify_type
  // enforces value-type compatibility at the TaskMember level.
  template< class A1 , class A2 >
  KOKKOS_INLINE_FUNCTION
  Future( const Future<A1,A2> & rhs )
    : m_task(0)
    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); }

  template< class A1 , class A2 >
  KOKKOS_INLINE_FUNCTION
  Future & operator = ( const Future<A1,A2> & rhs )
    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); return *this ; }

  //----------------------------------------

  typedef typename TaskValue::get_result_type get_result_type ;

  // Access the task's result.
  // NOTE(review): m_task is cast and dereferenced without a null check —
  // presumably callers must only call get() on a non-null future; confirm.
  KOKKOS_INLINE_FUNCTION
  get_result_type get() const
    { return static_cast<TaskValue*>( m_task )->get(); }
};

namespace Impl {

// Trait: true iff T is a Kokkos::Experimental::Future.
template< class T >
struct is_future : public Kokkos::Impl::bool_< false > {};

template< class Arg0 , class Arg1 >
struct is_future< Kokkos::Experimental::Future<Arg0,Arg1> > : public Kokkos::Impl::bool_< true > {};

} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Experimental {

/** \brief If the argument is 
an execution space then a serial task in that space */
template< class Arg0 = Kokkos::DefaultExecutionSpace >
class TaskPolicy {
public:

  typedef typename Arg0::execution_space execution_space ;

  //----------------------------------------
  /** \brief Create a serial task with storage for dependences.
   *
   * Postcondition: Task is in the 'constructing' state.
   *
   * NOTE(review): only declarations appear here — definitions come from a
   * per-space specialization, which presumably also supplies the default
   * for dependence_capacity (no default is declared on this primary
   * template; the free spawn() below calls create(functor) with one
   * argument). Confirm against the specializations.
   */
  template< class FunctorType >
  Future< typename FunctorType::value_type , execution_space >
    create( const FunctorType & functor
          , const unsigned dependence_capacity /* = default */ ) const ;

  /** \brief Create a foreach task with storage for dependences. */
  template< class ExecPolicy , class FunctorType >
  Future< typename FunctorType::value_type , execution_space >
    create_foreach( const ExecPolicy  & policy
                  , const FunctorType & functor
                  , const unsigned dependence_capacity /* = default */ ) const ;

  /** \brief Create a reduce task with storage for dependences. */
  template< class ExecPolicy , class FunctorType >
  Future< typename FunctorType::value_type , execution_space >
    create_reduce( const ExecPolicy  & policy
                 , const FunctorType & functor
                 , const unsigned dependence_capacity /* = default */ ) const ;

  /** \brief Create a scan task with storage for dependences. */
  template< class ExecPolicy , class FunctorType >
  Future< typename FunctorType::value_type , execution_space >
    create_scan( const ExecPolicy  & policy
               , const FunctorType & functor
               , const unsigned dependence_capacity /* = default */ ) const ;

  /** \brief Set dependence that 'after' cannot start execution
   *         until 'before' has completed.
   *
   *  Precondition: The 'after' task must be in then 'Constructing' state.
   */
  template< class TA , class TB >
  void set_dependence( const Future<TA,execution_space> & after
                     , const Future<TB,execution_space> & before ) const ;

  /** \brief Spawn a task in the 'Constructing' state
   *
   *  Precondition: Task is in the 'constructing' state.
   *  Postcondition: Task is waiting, executing, or complete.
   */
  template< class T >
  const Future<T,execution_space> &
    spawn( const Future<T,execution_space> & ) const ;

  //----------------------------------------
  /** \brief Query dependence of an executing task */

  template< class FunctorType >
  Future< execution_space >
    get_dependence( FunctorType * , const int ) const ;

  //----------------------------------------
  /** \brief Clear current dependences of an executing task
   *         in preparation for setting new dependences and
   *         respawning.
   *
   *  Precondition: The functor must be a task in the executing state.
   */
  template< class FunctorType >
  void clear_dependence( FunctorType * ) const ;

  /** \brief Set dependence that 'after' cannot start execution
   *         until 'before' has completed.
   *
   *  The 'after' functor must be in the executing state.
   */
  template< class FunctorType , class TB >
  void set_dependence( FunctorType * after
                     , const Future<TB,execution_space> & before ) const ;

  /** \brief Respawn (reschedule) an executing task to be called again
   *         after all dependences have completed.
   */
  template< class FunctorType >
  void respawn( FunctorType * ) const ;
};

//----------------------------------------------------------------------------
/** \brief Create and spawn a single-thread task */
template< class ExecSpace , class FunctorType >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn( TaskPolicy<ExecSpace> & policy , const FunctorType & functor )
{ return policy.spawn( policy.create( functor ) ); }

/** \brief Create and spawn a single-thread task with dependences.
 *
 * NOTE(review): this calls policy.add_dependence, but the TaskPolicy
 * template above declares only set_dependence — presumably the per-space
 * specialization provides add_dependence; confirm, or this cannot compile.
 */
template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn( TaskPolicy<ExecSpace> & policy
     , const FunctorType     & functor
     , const Future<Arg0,Arg1> & before_0
     , const Future<Arg0,Arg1> & before_1 )
{
  // Reserve capacity for the two dependences at creation time.
  Future< typename FunctorType::value_type , ExecSpace > f ;
  f = policy.create( functor , 2 );
  policy.add_dependence( f , before_0 );
  policy.add_dependence( f , before_1 );
  policy.spawn( f );
  return f ;
}

//----------------------------------------------------------------------------
/** \brief Create and spawn a parallel_for task */
template< class ExecSpace , class ParallelPolicyType , class FunctorType >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn_foreach( TaskPolicy<ExecSpace>    & task_policy
             , const ParallelPolicyType & parallel_policy
             , const FunctorType        & functor )
{ return task_policy.spawn( task_policy.create_foreach( parallel_policy , functor ) ); }

/** \brief Create and spawn a parallel_reduce task */
template< class ExecSpace , class ParallelPolicyType , class FunctorType >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn_reduce( TaskPolicy<ExecSpace>    & task_policy
            , const ParallelPolicyType & parallel_policy
            , const FunctorType        & functor )
{ return task_policy.spawn( task_policy.create_reduce( parallel_policy , functor ) ); }

+//---------------------------------------------------------------------------- +/** \brief Respawn a task functor with dependences */ +template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 > +inline +void respawn( TaskPolicy<ExecSpace> & policy + , FunctorType * functor + , const Future<Arg0,Arg1> & before_0 + , const Future<Arg0,Arg1> & before_1 + ) +{ + policy.clear_dependence( functor ); + policy.add_dependence( functor , before_0 ); + policy.add_dependence( functor , before_1 ); + policy.respawn( functor ); +} + +//---------------------------------------------------------------------------- + +template< class ExecSpace > +void wait( TaskPolicy< ExecSpace > & ); + +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_TASKPOLICY_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_Threads.hpp b/lib/kokkos/core/src/Kokkos_Threads.hpp new file mode 100755 index 0000000000000000000000000000000000000000..4661b714b235d3426b63dc5dcba7e77d514c2258 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Threads.hpp @@ -0,0 +1,217 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_HPP +#define KOKKOS_THREADS_HPP + +#include <Kokkos_Core_fwd.hpp> + +#if defined( KOKKOS_HAVE_PTHREAD ) + +#include <cstddef> +#include <iosfwd> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_ScratchSpace.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <impl/Kokkos_Tags.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +class ThreadsExec ; +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Execution space for a pool of Pthreads or C11 threads on a CPU. */ +class Threads { +public: + //! \name Type declarations that all Kokkos devices must provide. + //@{ + //! Tag this class as a kokkos execution space + typedef Threads execution_space ; + typedef Kokkos::HostSpace memory_space ; + + //! This execution space preferred device_type + typedef Kokkos::Device<execution_space,memory_space> device_type; + + typedef Kokkos::LayoutRight array_layout ; + typedef memory_space::size_type size_type ; + + typedef ScratchMemorySpace< Threads > scratch_memory_space ; + + + //@} + /*------------------------------------------------------------------------*/ + //! \name Static functions that all Kokkos devices must implement. + //@{ + + /// \brief True if and only if this method is being called in a + /// thread-parallel function. + static int in_parallel(); + + /** \brief Set the device in a "sleep" state. + * + * This function sets the device in a "sleep" state in which it is + * not ready for work. This may consume less resources than if the + * device were in an "awake" state, but it may also take time to + * bring the device from a sleep state to be ready for work. 
+ * + * \return True if the device is in the "sleep" state, else false if + * the device is actively working and could not enter the "sleep" + * state. + */ + static bool sleep(); + + /// \brief Wake the device from the 'sleep' state so it is ready for work. + /// + /// \return True if the device is in the "ready" state, else "false" + /// if the device is actively working (which also means that it's + /// awake). + static bool wake(); + + /// \brief Wait until all dispatched functors complete. + /// + /// The parallel_for or parallel_reduce dispatch of a functor may + /// return asynchronously, before the functor completes. This + /// method does not return until all dispatched functors on this + /// device have completed. + static void fence(); + + /// \brief Free any resources being consumed by the device. + /// + /// For the Threads device, this terminates spawned worker threads. + static void finalize(); + + /// \brief Print configuration information to the given output stream. + static void print_configuration( std::ostream & , const bool detail = false ); + + //@} + /*------------------------------------------------------------------------*/ + /*------------------------------------------------------------------------*/ + //! \name Space-specific functions + //@{ + + /** \brief Initialize the device in the "ready to work" state. + * + * The device is initialized in a "ready to work" or "awake" state. + * This state reduces latency and thus improves performance when + * dispatching work. However, the "awake" state consumes resources + * even when no work is being done. You may call sleep() to put + * the device in a "sleeping" state that does not consume as many + * resources, but it will take time (latency) to awaken the device + * again (via the wake()) method so that it is ready for work. + * + * Teams of threads are distributed as evenly as possible across + * the requested number of numa regions and cores per numa region. 
+ * A team will not be split across a numa region. + * + * If the 'use_' arguments are not supplied the hwloc is queried + * to use all available cores. + */ + static void initialize( unsigned threads_count = 0 , + unsigned use_numa_count = 0 , + unsigned use_cores_per_numa = 0 , + bool allow_asynchronous_threadpool = false ); + + static int is_initialized(); + + static Threads & instance( int = 0 ); + + //---------------------------------------- + + static int thread_pool_size( int depth = 0 ); +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + static int thread_pool_rank(); +#else + KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; } +#endif + + inline static unsigned max_hardware_threads() { return thread_pool_size(0); } + KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); } + + //@} + //---------------------------------------- +}; + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template<> +struct VerifyExecutionCanAccessMemorySpace + < Kokkos::Threads::memory_space + , Kokkos::Threads::scratch_memory_space + > +{ + enum { value = true }; + inline static void verify( void ) { } + inline static void verify( const void * ) { } +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +#include <Kokkos_ExecPolicy.hpp> +#include <Kokkos_Parallel.hpp> +#include <Threads/Kokkos_ThreadsExec.hpp> +#include <Threads/Kokkos_ThreadsTeam.hpp> +#include <Threads/Kokkos_Threads_Parallel.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */ +#endif /* #define KOKKOS_THREADS_HPP */ + + diff --git a/lib/kokkos/core/src/Kokkos_Vectorization.hpp 
b/lib/kokkos/core/src/Kokkos_Vectorization.hpp new file mode 100755 index 0000000000000000000000000000000000000000..a60c0ecaa7b83bd49fb187bf37ca5a84d6360744 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Vectorization.hpp @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Vectorization.hpp +/// \brief Declaration and definition of Kokkos::Vectorization interface. +#ifndef KOKKOS_VECTORIZATION_HPP +#define KOKKOS_VECTORIZATION_HPP + +#if defined( KOKKOS_HAVE_CUDA ) +#include <Cuda/Kokkos_Cuda_Vectorization.hpp> +#endif + +#endif diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp new file mode 100755 index 0000000000000000000000000000000000000000..cd6c8af9fedffb849e0cb8de8a5160e8557d1ffe --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -0,0 +1,1915 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEW_HPP +#define KOKKOS_VIEW_HPP + +#include <string> +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> + +#if ! 
defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + +#include <impl/Kokkos_StaticAssert.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Shape.hpp> +#include <impl/Kokkos_AnalyzeShape.hpp> +#include <impl/Kokkos_ViewOffset.hpp> +#include <impl/Kokkos_ViewSupport.hpp> +#include <impl/Kokkos_Tags.hpp> +#include <type_traits> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief View specialization mapping of view traits to a specialization tag */ +template< class ValueType , + class ArraySpecialize , + class ArrayLayout , + class MemorySpace , + class MemoryTraits > +struct ViewSpecialize ; + +/** \brief Defines the type of a subview given a source view type + * and subview argument types. + */ +template< class SrcViewType + , class Arg0Type + , class Arg1Type + , class Arg2Type + , class Arg3Type + , class Arg4Type + , class Arg5Type + , class Arg6Type + , class Arg7Type + > +struct ViewSubview /* { typedef ... type ; } */ ; + +template< class DstViewSpecialize , + class SrcViewSpecialize = void , + class Enable = void > +struct ViewAssignment ; + +template< class DstMemorySpace , class SrcMemorySpace > +struct DeepCopy ; + +} /* namespace Impl */ +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \class ViewTraits + * \brief Traits class for accessing attributes of a View. + * + * This is an implementation detail of View. It is only of interest + * to developers implementing a new specialization of View. 
+ * + * Template argument permutations: + * - View< DataType , void , void , void > + * - View< DataType , Space , void , void > + * - View< DataType , Space , MemoryTraits , void > + * - View< DataType , Space , void , MemoryTraits > + * - View< DataType , ArrayLayout , void , void > + * - View< DataType , ArrayLayout , Space , void > + * - View< DataType , ArrayLayout , MemoryTraits , void > + * - View< DataType , ArrayLayout , Space , MemoryTraits > + * - View< DataType , MemoryTraits , void , void > + */ + +template< class DataType , + class Arg1 = void , + class Arg2 = void , + class Arg3 = void > +class ViewTraits { +private: + + // Layout, Space, and MemoryTraits are optional + // but need to appear in that order. That means Layout + // can only be Arg1, Space can be Arg1 or Arg2, and + // MemoryTraits can be Arg1, Arg2 or Arg3 + + enum { Arg1IsLayout = Impl::is_array_layout<Arg1>::value }; + + enum { Arg1IsSpace = Impl::is_space<Arg1>::value }; + enum { Arg2IsSpace = Impl::is_space<Arg2>::value }; + + enum { Arg1IsMemoryTraits = Impl::is_memory_traits<Arg1>::value }; + enum { Arg2IsMemoryTraits = Impl::is_memory_traits<Arg2>::value }; + enum { Arg3IsMemoryTraits = Impl::is_memory_traits<Arg3>::value }; + + enum { Arg1IsVoid = Impl::is_same< Arg1 , void >::value }; + enum { Arg2IsVoid = Impl::is_same< Arg2 , void >::value }; + enum { Arg3IsVoid = Impl::is_same< Arg3 , void >::value }; + + // Arg1 is Layout, Space, MemoryTraits, or void + typedef typename + Impl::StaticAssert< + ( 1 == Arg1IsLayout + Arg1IsSpace + Arg1IsMemoryTraits + Arg1IsVoid ) + , Arg1 >::type Arg1Verified ; + + // If Arg1 is Layout then Arg2 is Space, MemoryTraits, or void + // If Arg1 is Space then Arg2 is MemoryTraits or void + // If Arg1 is MemoryTraits then Arg2 is void + // If Arg1 is Void then Arg2 is void + typedef typename + Impl::StaticAssert< + ( Arg1IsLayout && ( 1 == Arg2IsSpace + Arg2IsMemoryTraits + Arg2IsVoid ) ) || + ( Arg1IsSpace && ( 0 == Arg2IsSpace ) && ( 1 == 
Arg2IsMemoryTraits + Arg2IsVoid ) ) || + ( Arg1IsMemoryTraits && Arg2IsVoid ) || + ( Arg1IsVoid && Arg2IsVoid ) + , Arg2 >::type Arg2Verified ; + + // Arg3 is MemoryTraits or void and at most one argument is MemoryTraits + typedef typename + Impl::StaticAssert< + ( 1 == Arg3IsMemoryTraits + Arg3IsVoid ) && + ( Arg1IsMemoryTraits + Arg2IsMemoryTraits + Arg3IsMemoryTraits <= 1 ) + , Arg3 >::type Arg3Verified ; + + // Arg1 or Arg2 may have execution and memory spaces + typedef typename Impl::if_c<( Arg1IsSpace ), Arg1Verified , + typename Impl::if_c<( Arg2IsSpace ), Arg2Verified , + Kokkos::DefaultExecutionSpace + >::type >::type::execution_space ExecutionSpace ; + + typedef typename Impl::if_c<( Arg1IsSpace ), Arg1Verified , + typename Impl::if_c<( Arg2IsSpace ), Arg2Verified , + Kokkos::DefaultExecutionSpace + >::type >::type::memory_space MemorySpace ; + + typedef typename Impl::is_space< + typename Impl::if_c<( Arg1IsSpace ), Arg1Verified , + typename Impl::if_c<( Arg2IsSpace ), Arg2Verified , + Kokkos::DefaultExecutionSpace + >::type >::type >::host_mirror_space HostMirrorSpace ; + + // Arg1 may be array layout + typedef typename Impl::if_c< Arg1IsLayout , Arg1Verified , + typename ExecutionSpace::array_layout + >::type ArrayLayout ; + + // Arg1, Arg2, or Arg3 may be memory traits + typedef typename Impl::if_c< Arg1IsMemoryTraits , Arg1Verified , + typename Impl::if_c< Arg2IsMemoryTraits , Arg2Verified , + typename Impl::if_c< Arg3IsMemoryTraits , Arg3Verified , + MemoryManaged + >::type >::type >::type MemoryTraits ; + + typedef Impl::AnalyzeShape<DataType> analysis ; + +public: + + //------------------------------------ + // Data type traits: + + typedef DataType data_type ; + typedef typename analysis::const_type const_data_type ; + typedef typename analysis::non_const_type non_const_data_type ; + + //------------------------------------ + // Array of intrinsic scalar type traits: + + typedef typename analysis::array_intrinsic_type array_intrinsic_type ; + 
typedef typename analysis::const_array_intrinsic_type const_array_intrinsic_type ; + typedef typename analysis::non_const_array_intrinsic_type non_const_array_intrinsic_type ; + + //------------------------------------ + // Value type traits: + + typedef typename analysis::value_type value_type ; + typedef typename analysis::const_value_type const_value_type ; + typedef typename analysis::non_const_value_type non_const_value_type ; + + //------------------------------------ + // Layout and shape traits: + + typedef ArrayLayout array_layout ; + typedef typename analysis::shape shape_type ; + + enum { rank = shape_type::rank }; + enum { rank_dynamic = shape_type::rank_dynamic }; + + //------------------------------------ + // Execution space, memory space, memory access traits, and host mirror space. + + typedef ExecutionSpace execution_space ; + typedef MemorySpace memory_space ; + typedef Device<ExecutionSpace,MemorySpace> device_type ; + typedef MemoryTraits memory_traits ; + typedef HostMirrorSpace host_mirror_space ; + + typedef typename memory_space::size_type size_type ; + + enum { is_hostspace = Impl::is_same< memory_space , HostSpace >::value }; + enum { is_managed = memory_traits::Unmanaged == 0 }; + enum { is_random_access = memory_traits::RandomAccess == 1 }; + + //------------------------------------ + + + //------------------------------------ + // Specialization tag: + + typedef typename + Impl::ViewSpecialize< value_type + , typename analysis::specialize + , array_layout + , memory_space + , memory_traits + >::type specialize ; +}; + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +class ViewDefault {}; + +/** \brief Default view specialization has LayoutLeft, LayoutRight, or LayoutStride. 
+ */ +template< class ValueType , class MemorySpace , class MemoryTraits > +struct ViewSpecialize< ValueType , void , LayoutLeft , MemorySpace , MemoryTraits > +{ typedef ViewDefault type ; }; + +template< class ValueType , class MemorySpace , class MemoryTraits > +struct ViewSpecialize< ValueType , void , LayoutRight , MemorySpace , MemoryTraits > +{ typedef ViewDefault type ; }; + +template< class ValueType , class MemorySpace , class MemoryTraits > +struct ViewSpecialize< ValueType , void , LayoutStride , MemorySpace , MemoryTraits > +{ typedef ViewDefault type ; }; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Types for compile-time detection of View usage errors */ +namespace ViewError { + +struct allocation_constructor_requires_managed {}; +struct allocation_constructor_requires_nonconst {}; +struct user_pointer_constructor_requires_unmanaged {}; +struct device_shmem_constructor_requires_unmanaged {}; + +struct scalar_operator_called_from_non_scalar_view {}; + +} /* namespace ViewError */ + +//---------------------------------------------------------------------------- +/** \brief Enable view parentheses operator for + * match of layout and integral arguments. + * If correct rank define type from traits, + * otherwise define type as an error message. 
+ */ +template< class ReturnType , class Traits , class Layout , unsigned Rank , + typename iType0 = int , typename iType1 = int , + typename iType2 = int , typename iType3 = int , + typename iType4 = int , typename iType5 = int , + typename iType6 = int , typename iType7 = int , + class Enable = void > +struct ViewEnableArrayOper ; + +template< class ReturnType , class Traits , class Layout , unsigned Rank , + typename iType0 , typename iType1 , + typename iType2 , typename iType3 , + typename iType4 , typename iType5 , + typename iType6 , typename iType7 > +struct ViewEnableArrayOper< + ReturnType , Traits , Layout , Rank , + iType0 , iType1 , iType2 , iType3 , + iType4 , iType5 , iType6 , iType7 , + typename enable_if< + iType0(0) == 0 && iType1(0) == 0 && iType2(0) == 0 && iType3(0) == 0 && + iType4(0) == 0 && iType5(0) == 0 && iType6(0) == 0 && iType7(0) == 0 && + is_same< typename Traits::array_layout , Layout >::value && + ( unsigned(Traits::rank) == Rank ) + >::type > +{ + typedef ReturnType type ; +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \class View + * \brief View to an array of data. + * + * A View represents an array of one or more dimensions. + * For details, please refer to Kokkos' tutorial materials. + * + * \section Kokkos_View_TemplateParameters Template parameters + * + * This class has both required and optional template parameters. The + * \c DataType parameter must always be provided, and must always be + * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are + * placeholders for different template parameters. The default value + * of the fifth template parameter \c Specialize suffices for most use + * cases. 
When explaining the template parameters, we won't refer to + * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer + * to the valid categories of template parameters, in whatever order + * they may occur. + * + * Valid ways in which template arguments may be specified: + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , Space , void , MemoryTraits > + * - View< DataType , Layout , Space > + * - View< DataType , Layout , Space , MemoryTraits > + * + * \tparam DataType (required) This indicates both the type of each + * entry of the array, and the combination of compile-time and + * run-time array dimension(s). For example, <tt>double*</tt> + * indicates a one-dimensional array of \c double with run-time + * dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int + * with run-time first dimension and compile-time second dimension + * (of 3). In general, the run-time dimensions (if any) must go + * first, followed by zero or more compile-time dimensions. For + * more examples, please refer to the tutorial materials. + * + * \tparam Space (required) The memory space. + * + * \tparam Layout (optional) The array's layout in memory. For + * example, LayoutLeft indicates a column-major (Fortran style) + * layout, and LayoutRight a row-major (C style) layout. If not + * specified, this defaults to the preferred layout for the + * <tt>Space</tt>. + * + * \tparam MemoryTraits (optional) Assertion of the user's intended + * access behavior. For example, RandomAccess indicates read-only + * access with limited spatial locality, and Unmanaged lets users + * wrap externally allocated memory in a View without automatic + * deallocation. + * + * \section Kokkos_View_MT MemoryTraits discussion + * + * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on Space + * + * Some \c MemoryTraits options may have different interpretations for + * different \c Space types. 
For example, with the Cuda device, + * \c RandomAccess tells Kokkos to fetch the data through the texture + * cache, whereas the non-GPU devices have no such hardware construct. + * + * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits + * + * Users should defer applying the optional \c MemoryTraits parameter + * until the point at which they actually plan to rely on it in a + * computational kernel. This minimizes the number of template + * parameters exposed in their code, which reduces the cost of + * compilation. Users may always assign a View without specified + * \c MemoryTraits to a compatible View with that specification. + * For example: + * \code + * // Pass in the simplest types of View possible. + * void + * doSomething (View<double*, Cuda> out, + * View<const double*, Cuda> in) + * { + * // Assign the "generic" View in to a RandomAccess View in_rr. + * // Note that RandomAccess View objects must have const data. + * View<const double*, Cuda, RandomAccess> in_rr = in; + * // ... do something with in_rr and out ... 
+ * } + * \endcode + */ +template< class DataType , + class Arg1Type = void , /* ArrayLayout, SpaceType, or MemoryTraits */ + class Arg2Type = void , /* SpaceType or MemoryTraits */ + class Arg3Type = void , /* MemoryTraits */ + class Specialize = + typename ViewTraits<DataType,Arg1Type,Arg2Type,Arg3Type>::specialize > +class View ; + +namespace Impl { + +template< class C > +struct is_view : public bool_< false > {}; + +template< class D , class A1 , class A2 , class A3 , class S > +struct is_view< View< D , A1 , A2 , A3 , S > > : public bool_< true > {}; + +} + +//---------------------------------------------------------------------------- + +template< class DataType , + class Arg1Type , + class Arg2Type , + class Arg3Type > +class View< DataType , Arg1Type , Arg2Type , Arg3Type , Impl::ViewDefault > + : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > +{ +public: + + typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ; + +private: + + // Assignment of compatible views requirement: + template< class , class , class , class , class > friend class View ; + + // Assignment of compatible subview requirement: + template< class , class , class > friend struct Impl::ViewAssignment ; + + // Dimensions, cardinality, capacity, and offset computation for + // multidimensional array view of contiguous memory. 
+ // Inherits from Impl::Shape + typedef Impl::ViewOffset< typename traits::shape_type + , typename traits::array_layout + > offset_map_type ; + + // Intermediary class for data management and access + typedef Impl::ViewDataManagement< traits > view_data_management ; + + //---------------------------------------- + // Data members: + + typename view_data_management::handle_type m_ptr_on_device ; + offset_map_type m_offset_map ; + view_data_management m_management ; + Impl::AllocationTracker m_tracker ; + + //---------------------------------------- + +public: + + /** return type for all indexing operators */ + typedef typename view_data_management::return_type reference_type ; + + enum { reference_type_is_lvalue = view_data_management::ReturnTypeIsReference }; + + typedef View< typename traits::array_intrinsic_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > array_type ; + + typedef View< typename traits::const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > const_type ; + + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > non_const_type ; + + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::host_mirror_space , + void > HostMirror ; + + //------------------------------------ + // Shape + + enum { Rank = traits::rank }; + + KOKKOS_INLINE_FUNCTION offset_map_type shape() const { return m_offset_map ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return 
m_offset_map.N3 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type size() const { return m_offset_map.cardinality(); } + + template< typename iType > + KOKKOS_INLINE_FUNCTION + typename traits::size_type dimension( const iType & i ) const + { return Impl::dimension( m_offset_map , i ); } + + //------------------------------------ + // Destructor, constructors, assignment operators: + + KOKKOS_INLINE_FUNCTION + ~View() {} + + KOKKOS_INLINE_FUNCTION + View() + : m_ptr_on_device() + , m_offset_map() + , m_management() + , m_tracker() + { m_offset_map.assign(0, 0,0,0,0,0,0,0,0); } + + KOKKOS_INLINE_FUNCTION + View( const View & rhs ) + : m_ptr_on_device() + , m_offset_map() + , m_management() + , m_tracker() + { + (void) Impl::ViewAssignment< + typename traits::specialize , + typename traits::specialize >( *this , rhs ); + } + + KOKKOS_INLINE_FUNCTION + View & operator = ( const View & rhs ) + { + (void) Impl::ViewAssignment< + typename traits::specialize , + typename traits::specialize >( *this , rhs ); + return *this ; + } + + //------------------------------------ + // Construct or assign compatible view: + + template< class RT , class RL , class RD , class RM , class RS > + KOKKOS_INLINE_FUNCTION + View( const View<RT,RL,RD,RM,RS> & rhs ) + : m_ptr_on_device() + , m_offset_map() + , m_management() + , m_tracker() + { + (void) Impl::ViewAssignment< + typename traits::specialize , RS >( *this , rhs ); + } + + template< class RT , class RL , class RD , class RM , class RS > + KOKKOS_INLINE_FUNCTION + View & operator = ( const View<RT,RL,RD,RM,RS> & rhs ) + { + (void) 
Impl::ViewAssignment< + typename traits::specialize , RS >( *this , rhs ); + return *this ; + } + + //------------------------------------ + /**\brief Allocation of a managed view with possible alignment padding. + * + * Allocation properties for allocating and initializing to the default value_type: + * Kokkos::ViewAllocate() + * Kokkos::ViewAllocate("label") OR "label" + * Kokkos::ViewAllocate(std::string("label")) OR std::string("label") + * + * Allocation properties for allocating and bypassing initialization: + * Kokkos::ViewAllocateWithoutInitializing() + * Kokkos::ViewAllocateWithoutInitializing("label") + */ + + template< class AllocationProperties > + explicit inline + View( const AllocationProperties & prop , + // Impl::ViewAllocProp::size_type exists when the traits and allocation properties + // are valid for allocating viewed memory. + const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 , + const size_t n8 = 0 ) + : m_ptr_on_device() + , m_offset_map() + , m_management() + , m_tracker() + { + typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ; + + static_assert(!std::is_same<typename traits::array_layout, LayoutStride>::value, + "LayoutStride does not support View constructor which takes dimensions directly!"); + + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 ); + if(Alloc::AllowPadding) + m_offset_map.set_padding(); + + m_ptr_on_device = view_data_management::template allocate< Alloc::Initialize >( Alloc::label(prop) , m_offset_map, m_tracker ); + + } + + template< class AllocationProperties > + explicit inline + View( const AllocationProperties & prop , + const typename traits::array_layout & layout , + // Impl::ViewAllocProp::size_type exists when the traits and allocation properties + // are valid for allocating viewed 
memory. + const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type = 0 ) + : m_ptr_on_device() + , m_offset_map() + , m_management() + , m_tracker() + { + typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ; + + m_offset_map.assign( layout ); + if(Alloc::AllowPadding) + m_offset_map.set_padding(); + + m_ptr_on_device = view_data_management::template allocate< Alloc::Initialize >( Alloc::label(prop) , m_offset_map, m_tracker ); + + m_management.set_noncontiguous(); + } + + //------------------------------------ + // Assign an unmanaged View from pointer, can be called in functors. + // No alignment padding is performed. + + template< class Type > + explicit KOKKOS_INLINE_FUNCTION + View( Type * ptr , + typename Impl::ViewRawPointerProp< traits , Type >::size_type n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 , + const size_t n8 = 0 ) + : m_ptr_on_device(ptr) + , m_offset_map() + , m_management() + , m_tracker() + { + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 ); + m_management.set_unmanaged(); + } + + template< class Type > + explicit KOKKOS_INLINE_FUNCTION + View( Type * ptr , + typename traits::array_layout const & layout , + typename Impl::ViewRawPointerProp< traits , Type >::size_type = 0 ) + : m_ptr_on_device(ptr) + , m_offset_map() + , m_management() + , m_tracker() + { + m_offset_map.assign( layout ); + m_management.set_unmanaged(); + m_management.set_noncontiguous(); + } + + + + //------------------------------------ + // Assign a View from an AllocationTracker, + // The allocator used must be compatiable with the memory space of the view + // No alignment padding is performed. + // TODO: Should these allow padding??? 
DJS 01/15/15 + explicit + View( Impl::AllocationTracker const &arg_tracker , + const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 , + const size_t n8 = 0 ) + : m_ptr_on_device(reinterpret_cast<typename traits::value_type*>(arg_tracker.alloc_ptr())) + , m_offset_map() + , m_management() + , m_tracker(arg_tracker) + { + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 ); + + const size_t req_size = m_offset_map.capacity() * sizeof(typename traits::value_type); + if ( m_tracker.alloc_size() < req_size ) { + Impl::throw_runtime_exception("Error: tracker.alloc_size() < req_size"); + } + } + + explicit + View( Impl::AllocationTracker const & arg_tracker + , typename traits::array_layout const & layout ) + : m_ptr_on_device(reinterpret_cast<typename traits::value_type*>(arg_tracker.alloc_ptr())) + , m_offset_map() + , m_management() + , m_tracker(arg_tracker) + { + m_offset_map.assign( layout ); + + const size_t req_size = m_offset_map.capacity() * sizeof(typename traits::value_type); + if ( m_tracker.alloc_size() < req_size ) { + Impl::throw_runtime_exception("Error: tracker.alloc_size() < req_size"); + } + + m_management.set_noncontiguous(); + } + + //------------------------------------ + /** \brief Constructors for subviews requires following + * type-compatibility condition, enforce via StaticAssert. 
+ * + * Impl::is_same< View , + * typename Impl::ViewSubview< View<D,A1,A2,A3,Impl::ViewDefault> + * , ArgType0 , ArgType1 , ArgType2 , ArgType3 + * , ArgType4 , ArgType5 , ArgType6 , ArgType7 + * >::type >::value + */ + template< class D , class A1 , class A2 , class A3 + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type + > + KOKKOS_INLINE_FUNCTION + View( const View<D,A1,A2,A3,Impl::ViewDefault> & src + , const SubArg0_type & arg0 , const SubArg1_type & arg1 + , const SubArg2_type & arg2 , const SubArg3_type & arg3 + , const SubArg4_type & arg4 , const SubArg5_type & arg5 + , const SubArg6_type & arg6 , const SubArg7_type & arg7 + ); + + template< class D , class A1 , class A2 , class A3 + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + , class SubArg4_type , class SubArg5_type , class SubArg6_type + > + KOKKOS_INLINE_FUNCTION + View( const View<D,A1,A2,A3,Impl::ViewDefault> & src + , const SubArg0_type & arg0 , const SubArg1_type & arg1 + , const SubArg2_type & arg2 , const SubArg3_type & arg3 + , const SubArg4_type & arg4 , const SubArg5_type & arg5 + , const SubArg6_type & arg6 + ); + + template< class D , class A1 , class A2 , class A3 + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + , class SubArg4_type , class SubArg5_type + > + KOKKOS_INLINE_FUNCTION + View( const View<D,A1,A2,A3,Impl::ViewDefault> & src + , const SubArg0_type & arg0 , const SubArg1_type & arg1 + , const SubArg2_type & arg2 , const SubArg3_type & arg3 + , const SubArg4_type & arg4 , const SubArg5_type & arg5 + ); + + template< class D , class A1 , class A2 , class A3 + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + , class SubArg4_type + > + KOKKOS_INLINE_FUNCTION + View( const View<D,A1,A2,A3,Impl::ViewDefault> & src + , const SubArg0_type & arg0 , const 
SubArg1_type & arg1 + , const SubArg2_type & arg2 , const SubArg3_type & arg3 + , const SubArg4_type & arg4 + ); + + template< class D , class A1 , class A2 , class A3 + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + > + KOKKOS_INLINE_FUNCTION + View( const View<D,A1,A2,A3,Impl::ViewDefault> & src + , const SubArg0_type & arg0 , const SubArg1_type & arg1 + , const SubArg2_type & arg2 , const SubArg3_type & arg3 + ); + + template< class D , class A1 , class A2 , class A3 + , class SubArg0_type , class SubArg1_type , class SubArg2_type + > + KOKKOS_INLINE_FUNCTION + View( const View<D,A1,A2,A3,Impl::ViewDefault> & src + , const SubArg0_type & arg0 , const SubArg1_type & arg1 + , const SubArg2_type & arg2 + ); + + template< class D , class A1 , class A2 , class A3 + , class SubArg0_type , class SubArg1_type + > + KOKKOS_INLINE_FUNCTION + View( const View<D,A1,A2,A3,Impl::ViewDefault> & src + , const SubArg0_type & arg0 , const SubArg1_type & arg1 + ); + + template< class D , class A1 , class A2 , class A3 + , class SubArg0_type + > + KOKKOS_INLINE_FUNCTION + View( const View<D,A1,A2,A3,Impl::ViewDefault> & src + , const SubArg0_type & arg0 + ); + + //------------------------------------ + // Assign unmanaged View to portion of execution space's shared memory + + typedef Impl::if_c< ! 
traits::is_managed , + const typename traits::execution_space::scratch_memory_space & , + Impl::ViewError::device_shmem_constructor_requires_unmanaged > + if_scratch_memory_constructor ; + + explicit KOKKOS_INLINE_FUNCTION + View( typename if_scratch_memory_constructor::type space , + const unsigned n0 = 0 , + const unsigned n1 = 0 , + const unsigned n2 = 0 , + const unsigned n3 = 0 , + const unsigned n4 = 0 , + const unsigned n5 = 0 , + const unsigned n6 = 0 , + const unsigned n7 = 0 ) + : m_ptr_on_device() + , m_offset_map() + , m_management() + , m_tracker() + { + typedef typename traits::value_type value_type_ ; + + enum { align = 8 }; + enum { mask = align - 1 }; + + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); + + typedef Impl::if_c< ! traits::is_managed , + value_type_ * , + Impl::ViewError::device_shmem_constructor_requires_unmanaged > + if_device_shmem_pointer ; + + // Select the first argument: + m_ptr_on_device = if_device_shmem_pointer::select( + (value_type_*) space.get_shmem( unsigned( sizeof(value_type_) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) ); + } + + explicit KOKKOS_INLINE_FUNCTION + View( typename if_scratch_memory_constructor::type space , + typename traits::array_layout const & layout) + : m_ptr_on_device() + , m_offset_map() + , m_management() + , m_tracker() + { + typedef typename traits::value_type value_type_ ; + + typedef Impl::if_c< ! 
traits::is_managed , + value_type_ * , + Impl::ViewError::device_shmem_constructor_requires_unmanaged > + if_device_shmem_pointer ; + + m_offset_map.assign( layout ); + m_management.set_unmanaged(); + m_management.set_noncontiguous(); + + enum { align = 8 }; + enum { mask = align - 1 }; + + // Select the first argument: + m_ptr_on_device = if_device_shmem_pointer::select( + (value_type_*) space.get_shmem( unsigned( sizeof(value_type_) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) ); + } + + static inline + unsigned shmem_size( const unsigned n0 = 0 , + const unsigned n1 = 0 , + const unsigned n2 = 0 , + const unsigned n3 = 0 , + const unsigned n4 = 0 , + const unsigned n5 = 0 , + const unsigned n6 = 0 , + const unsigned n7 = 0 ) + { + enum { align = 8 }; + enum { mask = align - 1 }; + + typedef typename traits::value_type value_type_ ; + + offset_map_type offset_map ; + + offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); + + return unsigned( sizeof(value_type_) * offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ; + } + + //------------------------------------ + // Is not allocated + + KOKKOS_FORCEINLINE_FUNCTION + bool is_null() const { return 0 == ptr_on_device() ; } + + //------------------------------------ + // Operators for scalar (rank zero) views. 
+ + typedef Impl::if_c< traits::rank == 0 , + typename traits::value_type , + Impl::ViewError::scalar_operator_called_from_non_scalar_view > + if_scalar_operator ; + + KOKKOS_INLINE_FUNCTION + const View & operator = ( const typename if_scalar_operator::type & rhs ) const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + *m_ptr_on_device = if_scalar_operator::select( rhs ); + return *this ; + } + + KOKKOS_FORCEINLINE_FUNCTION + operator typename if_scalar_operator::type & () const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + return if_scalar_operator::select( *m_ptr_on_device ); + } + + KOKKOS_FORCEINLINE_FUNCTION + typename if_scalar_operator::type & operator()() const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + return if_scalar_operator::select( *m_ptr_on_device ); + } + + KOKKOS_FORCEINLINE_FUNCTION + typename if_scalar_operator::type & operator*() const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + return if_scalar_operator::select( *m_ptr_on_device ); + } + + //------------------------------------ + // Array member access operators enabled if + // (1) a zero value of all argument types are compile-time comparable to zero + // (2) the rank matches the number of arguments + // (3) the memory space is valid for the access + //------------------------------------ + // rank 1: + // Specialisation for LayoutLeft and LayoutRight since we know its stride 1 + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutLeft, 1, iType0 >::type + operator[] ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ i0 ]; + } + + template< typename iType0 > + 
KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutLeft, 1, iType0 >::type + operator() ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ i0 ]; + } + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutLeft, 1, iType0 >::type + at( const iType0 & i0 , const int , const int , const int , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ i0 ]; + } + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutRight, 1, iType0 >::type + operator[] ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ i0 ]; + } + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutRight, 1, iType0 >::type + operator() ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ i0 ]; + } + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutRight, 1, iType0 >::type + at( const iType0 & i0 , const int , const int , const int , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return 
m_ptr_on_device[ i0 ]; + } + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , traits, + typename Impl::if_c< + Impl::is_same<typename traits::array_layout, LayoutRight>::value || + Impl::is_same<typename traits::array_layout, LayoutLeft>::value , + void, typename traits::array_layout>::type, + 1, iType0 >::type + operator[] ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0) ]; + } + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , traits, + typename Impl::if_c< + Impl::is_same<typename traits::array_layout, LayoutRight>::value || + Impl::is_same<typename traits::array_layout, LayoutLeft>::value , + void, typename traits::array_layout>::type, + 1, iType0 >::type + operator() ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0) ]; + } + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , traits, + typename Impl::if_c< + Impl::is_same<typename traits::array_layout, LayoutRight>::value || + Impl::is_same<typename traits::array_layout, LayoutLeft>::value , + void, typename traits::array_layout>::type, + 1, iType0 >::type + at( const iType0 & i0 , const int , const int , const int , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0) ]; + } + + // rank 2: + + template< typename iType0 , typename iType1 > + KOKKOS_FORCEINLINE_FUNCTION + typename 
Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 2, iType0, iType1 >::type + operator() ( const iType0 & i0 , const iType1 & i1 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1) ]; + } + + template< typename iType0 , typename iType1 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 2, iType0, iType1 >::type + at( const iType0 & i0 , const iType1 & i1 , const int , const int , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1) ]; + } + + // rank 3: + + template< typename iType0 , typename iType1 , typename iType2 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const int , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2) 
]; + } + + // rank 4: + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3) ]; + } + + // rank 5: + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3 , iType4 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 > + KOKKOS_FORCEINLINE_FUNCTION + typename 
Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3 , iType4 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4) ]; + } + + // rank 6: + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 6, + iType0, iType1, iType2, iType3 , iType4, iType5 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 6, + iType0, iType1, iType2, iType3 , iType4, iType5 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5) ]; + } + + // rank 7: + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + 
typename iType4 , typename iType5 , typename iType6 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 7, + iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 , typename iType6 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 7, + iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ]; + } + + // rank 8: + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 , typename iType6 , typename iType7 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 8, + iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, 
i0,i1,i2,i3,i4,i5,i6,i7 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 , typename iType6 , typename iType7 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< reference_type , + traits, typename traits::array_layout, 8, + iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ]; + } + + //------------------------------------ + // Access to the underlying contiguous storage of this view specialization. + // These methods are specific to specialization of a view. + + KOKKOS_FORCEINLINE_FUNCTION + typename traits::value_type * ptr_on_device() const + { return (typename traits::value_type *) m_ptr_on_device ; } + + // Stride of physical storage, dimensioned to at least Rank + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { m_offset_map.stride(s); } + + // Count of contiguously allocated data members including padding. + KOKKOS_INLINE_FUNCTION + typename traits::size_type capacity() const + { return m_offset_map.capacity(); } + + // If the view data can be treated (deep copied) + // as a contiguous block of memory. 
+ KOKKOS_INLINE_FUNCTION + bool is_contiguous() const + { return m_management.is_contiguous(); } + + const Impl::AllocationTracker & tracker() const { return m_tracker; } +}; + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class LT , class LL , class LD , class LM , class LS , + class RT , class RL , class RD , class RM , class RS > +KOKKOS_INLINE_FUNCTION +typename Impl::enable_if<( Impl::is_same< LS , RS >::value ), bool >::type +operator == ( const View<LT,LL,LD,LM,LS> & lhs , + const View<RT,RL,RD,RM,RS> & rhs ) +{ + // Same data, layout, dimensions + typedef ViewTraits<LT,LL,LD,LM> lhs_traits ; + typedef ViewTraits<RT,RL,RD,RM> rhs_traits ; + + return + Impl::is_same< typename lhs_traits::const_data_type , + typename rhs_traits::const_data_type >::value && + Impl::is_same< typename lhs_traits::array_layout , + typename rhs_traits::array_layout >::value && + Impl::is_same< typename lhs_traits::memory_space , + typename rhs_traits::memory_space >::value && + Impl::is_same< typename lhs_traits::specialize , + typename rhs_traits::specialize >::value && + lhs.ptr_on_device() == rhs.ptr_on_device() && + lhs.shape() == rhs.shape() ; +} + +template< class LT , class LL , class LD , class LM , class LS , + class RT , class RL , class RD , class RM , class RS > +KOKKOS_INLINE_FUNCTION +bool operator != ( const View<LT,LL,LD,LM,LS> & lhs , + const View<RT,RL,RD,RM,RS> & rhs ) +{ + return ! 
operator==( lhs , rhs ); +} + +//---------------------------------------------------------------------------- + + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +//---------------------------------------------------------------------------- +/** \brief Deep copy a value into a view. + */ +template< class DT , class DL , class DD , class DM , class DS > +inline +void deep_copy( const View<DT,DL,DD,DM,DS> & dst , + typename Impl::enable_if<( + Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::non_const_value_type , + typename ViewTraits<DT,DL,DD,DM>::value_type >::value + ), typename ViewTraits<DT,DL,DD,DM>::const_value_type >::type & value ) +{ + Impl::ViewFill< View<DT,DL,DD,DM,DS> >( dst , value ); +} + +template< class ST , class SL , class SD , class SM , class SS > +inline +typename Impl::enable_if<( ViewTraits<ST,SL,SD,SM>::rank == 0 )>::type +deep_copy( ST & dst , const View<ST,SL,SD,SM,SS> & src ) +{ + typedef ViewTraits<ST,SL,SD,SM> src_traits ; + typedef typename src_traits::memory_space src_memory_space ; + Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.ptr_on_device() , sizeof(ST) ); +} + +//---------------------------------------------------------------------------- +/** \brief A deep copy between views of compatible type, and rank zero. 
+ */ +template< class DT , class DL , class DD , class DM , class DS , + class ST , class SL , class SD , class SM , class SS > +inline +void deep_copy( const View<DT,DL,DD,DM,DS> & dst , + const View<ST,SL,SD,SM,SS> & src , + typename Impl::enable_if<( + // Same type and destination is not constant: + Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type , + typename View<ST,SL,SD,SM,SS>::non_const_value_type >::value + && + // Rank zero: + ( unsigned(View<DT,DL,DD,DM,DS>::rank) == unsigned(0) ) && + ( unsigned(View<ST,SL,SD,SM,SS>::rank) == unsigned(0) ) + )>::type * = 0 ) +{ + typedef View<DT,DL,DD,DM,DS> dst_type ; + typedef View<ST,SL,SD,SM,SS> src_type ; + + typedef typename dst_type::memory_space dst_memory_space ; + typedef typename src_type::memory_space src_memory_space ; + typedef typename src_type::value_type value_type ; + + if ( dst.ptr_on_device() != src.ptr_on_device() ) { + Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , sizeof(value_type) ); + } +} + +//---------------------------------------------------------------------------- +/** \brief A deep copy between views of the default specialization, compatible type, + * same non-zero rank, same contiguous layout. 
+ */ +template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > +inline +void deep_copy( const View<DT,DL,DD,DM,Impl::ViewDefault> & dst , + const View<ST,SL,SD,SM,Impl::ViewDefault> & src , + typename Impl::enable_if<( + // Same type and destination is not constant: + Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::value_type , + typename View<ST,SL,SD,SM,Impl::ViewDefault>::non_const_value_type >::value + && + // Same non-zero rank: + ( unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) == + unsigned(View<ST,SL,SD,SM,Impl::ViewDefault>::rank) ) + && + ( 0 < unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) ) + && + // Same layout: + Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , + typename View<ST,SL,SD,SM,Impl::ViewDefault>::array_layout >::value + )>::type * = 0 ) +{ + typedef View<DT,DL,DD,DM,Impl::ViewDefault> dst_type ; + typedef View<ST,SL,SD,SM,Impl::ViewDefault> src_type ; + + typedef typename dst_type::memory_space dst_memory_space ; + typedef typename src_type::memory_space src_memory_space ; + + enum { is_contiguous = // Contiguous (e.g., non-strided, non-tiled) layout + Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutLeft >::value || + Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutRight >::value }; + + if ( dst.ptr_on_device() != src.ptr_on_device() ) { + + // Same shape (dimensions) + + const bool shapes_are_equal = dst.shape() == src.shape(); + + if ( shapes_are_equal && is_contiguous && dst.capacity() == src.capacity() ) { + + // Views span equal length contiguous range. + // Assuming can perform a straight memory copy over this range. 
+ + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity(); + + Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes ); + } + else { + // Destination view's execution space must be able to directly access source memory space + // in order for the ViewRemap functor run in the destination memory space's execution space. + size_t stride[8]; + src.stride(stride); + size_t size_stride = stride[0]*src.dimension_0(); + size_t size_dim = src.dimension_0(); + for(int i = 1; i<src.rank; i++) { + if(stride[i]*src.dimension(i)>size_stride) + size_stride = stride[i]*src.dimension(i); + size_dim*=src.dimension(i); + } + + if( shapes_are_equal && size_stride == size_dim) { + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity(); + + Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes ); + } else { + Impl::ViewRemap< dst_type , src_type >( dst , src ); + } + } + } +} + + +/** \brief Deep copy equal dimension arrays in the same space which + * have different layouts or specializations. 
+ */ +template< class DT , class DL , class DD , class DM , class DS , + class ST , class SL , class SD , class SM , class SS > +inline +void deep_copy( const View< DT, DL, DD, DM, DS > & dst , + const View< ST, SL, SD, SM, SS > & src , + const typename Impl::enable_if<( + // Same type and destination is not constant: + Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type , + typename View<DT,DL,DD,DM,DS>::non_const_value_type >::value + && + // Source memory space is accessible to destination memory space + Impl::VerifyExecutionCanAccessMemorySpace< typename View<DT,DL,DD,DM,DS>::memory_space + , typename View<ST,SL,SD,SM,SS>::memory_space >::value + && + // Same non-zero rank + ( unsigned( View<DT,DL,DD,DM,DS>::rank ) == + unsigned( View<ST,SL,SD,SM,SS>::rank ) ) + && + ( 0 < unsigned( View<DT,DL,DD,DM,DS>::rank ) ) + && + // Different layout or different specialization: + ( ( ! Impl::is_same< typename View<DT,DL,DD,DM,DS>::array_layout , + typename View<ST,SL,SD,SM,SS>::array_layout >::value ) + || + ( ! Impl::is_same< DS , SS >::value ) + ) + )>::type * = 0 ) +{ + typedef View< DT, DL, DD, DM, DS > dst_type ; + typedef View< ST, SL, SD, SM, SS > src_type ; + + assert_shapes_equal_dimension( dst.shape() , src.shape() ); + + Impl::ViewRemap< dst_type , src_type >( dst , src ); +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template< class T , class L , class D , class M , class S > +typename Impl::enable_if<( + View<T,L,D,M,S>::is_managed && + !Impl::is_same<L,LayoutStride>::value + ), typename View<T,L,D,M,S>::HostMirror >::type +inline +create_mirror( const View<T,L,D,M,S> & src ) +{ + typedef View<T,L,D,M,S> view_type ; + typedef typename view_type::HostMirror host_view_type ; + + // 'view' is managed therefore we can allocate a + // compatible host_view through the ordinary constructor. 
+ + std::string label = src.tracker().label(); + label.append("_mirror"); + + return host_view_type( label , + src.dimension_0() , + src.dimension_1() , + src.dimension_2() , + src.dimension_3() , + src.dimension_4() , + src.dimension_5() , + src.dimension_6() , + src.dimension_7() ); +} + +template< class T , class L , class D , class M , class S > +typename Impl::enable_if<( + View<T,L,D,M,S>::is_managed && + Impl::is_same<L,LayoutStride>::value + ), typename View<T,L,D,M,S>::HostMirror >::type +inline +create_mirror( const View<T,L,D,M,S> & src ) +{ + typedef View<T,L,D,M,S> view_type ; + typedef typename view_type::HostMirror host_view_type ; + + // 'view' is managed therefore we can allocate a + // compatible host_view through the ordinary constructor. + + std::string label = src.tracker().label(); + label.append("_mirror"); + LayoutStride layout; + src.stride(layout.stride); + layout.dimension[0] = src.dimension_0(); + layout.dimension[1] = src.dimension_1(); + layout.dimension[2] = src.dimension_2(); + layout.dimension[3] = src.dimension_3(); + layout.dimension[4] = src.dimension_4(); + layout.dimension[5] = src.dimension_5(); + layout.dimension[6] = src.dimension_6(); + layout.dimension[7] = src.dimension_7(); + + return host_view_type( label , layout ); +} +template< class T , class L , class D , class M , class S > +typename Impl::enable_if<( + View<T,L,D,M,S>::is_managed && + Impl::ViewAssignable< typename View<T,L,D,M,S>::HostMirror , View<T,L,D,M,S> >::value + ), typename View<T,L,D,M,S>::HostMirror >::type +inline +create_mirror_view( const View<T,L,D,M,S> & src ) +{ + return src ; +} + +template< class T , class L , class D , class M , class S > +typename Impl::enable_if<( + View<T,L,D,M,S>::is_managed && + ! 
Impl::ViewAssignable< typename View<T,L,D,M,S>::HostMirror , View<T,L,D,M,S> >::value + ), typename View<T,L,D,M,S>::HostMirror >::type +inline +create_mirror_view( const View<T,L,D,M,S> & src ) +{ + return create_mirror( src ); +} + +//---------------------------------------------------------------------------- + +/** \brief Resize a view with copying old data to new data at the corresponding indices. */ +template< class T , class L , class D , class M , class S > +inline +void resize( View<T,L,D,M,S> & v , + const typename Impl::enable_if< ViewTraits<T,L,D,M>::is_managed , size_t >::type n0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) +{ + typedef View<T,L,D,M,S> view_type ; + + const std::string label = v.tracker().label(); + + view_type v_resized( label, n0, n1, n2, n3, n4, n5, n6, n7 ); + + Impl::ViewRemap< view_type , view_type >( v_resized , v ); + + v = v_resized ; +} + +/** \brief Reallocate a view without copying old data to new data */ +template< class T , class L , class D , class M , class S > +inline +void realloc( View<T,L,D,M,S> & v , + const typename Impl::enable_if< ViewTraits<T,L,D,M>::is_managed , size_t >::type n0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) +{ + typedef View<T,L,D,M,S> view_type ; + + // Query the current label and reuse it. + const std::string label = v.tracker().label(); + + v = view_type(); // deallocate first, if the only view to memory. 
  v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 );
}

} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {

/** \brief  Tag denoting that a subview should capture all of a dimension */
struct ALL { KOKKOS_INLINE_FUNCTION ALL(){} };

/** \brief  Construct a subview of 'src' from eight per-dimension specifiers.
 *
 *  The destination view type is computed at compile time by
 *  Impl::ViewSubview from the source view type and the specifier types;
 *  all work of extracting the subview is delegated to that destination
 *  type's constructor.  ALL captures a whole dimension; the semantics of
 *  other specifier kinds (presumably integral indices or index ranges —
 *  see Impl::ViewSubview) are defined by the destination constructor.
 *
 *  The seven overloads that follow handle fewer specifiers: each
 *  unsupplied trailing specifier type is fixed to 'void'.
 */
template< class D , class A1 , class A2 , class A3 , class S ,
          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
          class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 >
KOKKOS_INLINE_FUNCTION
typename Impl::ViewSubview< View<D,A1,A2,A3,S>
                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
                          , ArgType4 , ArgType5 , ArgType6 , ArgType7
                          >::type
subview( const View<D,A1,A2,A3,S> & src ,
         const ArgType0 & arg0 ,
         const ArgType1 & arg1 ,
         const ArgType2 & arg2 ,
         const ArgType3 & arg3 ,
         const ArgType4 & arg4 ,
         const ArgType5 & arg5 ,
         const ArgType6 & arg6 ,
         const ArgType7 & arg7 )
{
  // Name the computed destination type, then forward every specifier to it.
  typedef typename
    Impl::ViewSubview< View<D,A1,A2,A3,S>
                     , ArgType0 , ArgType1 , ArgType2 , ArgType3
                     , ArgType4 , ArgType5 , ArgType6 , ArgType7
                     >::type
    DstViewType ;

  return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 );
}

/** \brief  Subview from seven specifiers; the eighth type slot is 'void'. */
template< class D , class A1 , class A2 , class A3 , class S ,
          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
          class ArgType4 , class ArgType5 , class ArgType6 >
KOKKOS_INLINE_FUNCTION
typename Impl::ViewSubview< View<D,A1,A2,A3,S>
                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
                          , ArgType4 , ArgType5 , ArgType6 , void
                          >::type
subview( const View<D,A1,A2,A3,S> & src ,
         const ArgType0 & arg0 ,
         const ArgType1 & arg1 ,
         const ArgType2 & arg2 ,
         const ArgType3 & arg3 ,
         const ArgType4 & arg4 ,
         const ArgType5 & arg5 ,
         const ArgType6 & arg6 )
{
  typedef typename
    Impl::ViewSubview< View<D,A1,A2,A3,S>
                     , ArgType0 , ArgType1 , ArgType2 , ArgType3
                     , ArgType4 , ArgType5 , ArgType6 , void
                     >::type
    DstViewType ;

  return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5, arg6 );
}

/** \brief  Subview from six specifiers; trailing type slots are 'void'. */
template< class D , class A1 , class A2 , class A3 , class S ,
          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
          class ArgType4 , class ArgType5 >
KOKKOS_INLINE_FUNCTION
typename Impl::ViewSubview< View<D,A1,A2,A3,S>
                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
                          , ArgType4 , ArgType5 , void , void
                          >::type
subview( const View<D,A1,A2,A3,S> & src ,
         const ArgType0 & arg0 ,
         const ArgType1 & arg1 ,
         const ArgType2 & arg2 ,
         const ArgType3 & arg3 ,
         const ArgType4 & arg4 ,
         const ArgType5 & arg5 )
{
  typedef typename
    Impl::ViewSubview< View<D,A1,A2,A3,S>
                     , ArgType0 , ArgType1 , ArgType2 , ArgType3
                     , ArgType4 , ArgType5 , void , void
                     >::type
    DstViewType ;

  return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5 );
}

/** \brief  Subview from five specifiers; trailing type slots are 'void'. */
template< class D , class A1 , class A2 , class A3 , class S ,
          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
          class ArgType4 >
KOKKOS_INLINE_FUNCTION
typename Impl::ViewSubview< View<D,A1,A2,A3,S>
                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
                          , ArgType4 , void , void , void
                          >::type
subview( const View<D,A1,A2,A3,S> & src ,
         const ArgType0 & arg0 ,
         const ArgType1 & arg1 ,
         const ArgType2 & arg2 ,
         const ArgType3 & arg3 ,
         const ArgType4 & arg4 )
{
  typedef typename
    Impl::ViewSubview< View<D,A1,A2,A3,S>
                     , ArgType0 , ArgType1 , ArgType2 , ArgType3
                     , ArgType4 , void , void , void
                     >::type
    DstViewType ;

  return DstViewType( src, arg0, arg1, arg2, arg3, arg4 );
}

/** \brief  Subview from four specifiers; trailing type slots are 'void'. */
template< class D , class A1 , class A2 , class A3 , class S ,
          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 >
KOKKOS_INLINE_FUNCTION
typename Impl::ViewSubview< View<D,A1,A2,A3,S>
                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
                          , void , void , void , void
                          >::type
subview( const View<D,A1,A2,A3,S> & src ,
         const ArgType0 & arg0 ,
         const ArgType1 & arg1 ,
         const ArgType2 & arg2 ,
         const ArgType3 & arg3 )
{
  typedef typename
    Impl::ViewSubview< View<D,A1,A2,A3,S>
                     , ArgType0 , ArgType1 , ArgType2 , ArgType3
                     , void , void , void , void
                     >::type
    DstViewType ;

  return DstViewType( src, arg0, arg1, arg2, arg3 );
}

/** \brief  Subview from three specifiers; trailing type slots are 'void'. */
template< class D , class A1 , class A2 , class A3 , class S ,
          class ArgType0 , class ArgType1 , class ArgType2 >
KOKKOS_INLINE_FUNCTION
typename Impl::ViewSubview< View<D,A1,A2,A3,S>
                          , ArgType0 , ArgType1 , ArgType2 , void
                          , void , void , void , void
                          >::type
subview( const View<D,A1,A2,A3,S> & src ,
         const ArgType0 & arg0 ,
         const ArgType1 & arg1 ,
         const ArgType2 & arg2 )
{
  typedef typename
    Impl::ViewSubview< View<D,A1,A2,A3,S>
                     , ArgType0 , ArgType1 , ArgType2 , void
                     , void , void , void , void
                     >::type
    DstViewType ;

  return DstViewType( src, arg0, arg1, arg2 );
}

/** \brief  Subview from two specifiers; trailing type slots are 'void'. */
template< class D , class A1 , class A2 , class A3 , class S ,
          class ArgType0 , class ArgType1 >
KOKKOS_INLINE_FUNCTION
typename Impl::ViewSubview< View<D,A1,A2,A3,S>
                          , ArgType0 , ArgType1 , void , void
                          , void , void , void , void
                          >::type
subview( const View<D,A1,A2,A3,S> & src ,
         const ArgType0 & arg0 ,
         const ArgType1 & arg1 )
{
  typedef typename
    Impl::ViewSubview< View<D,A1,A2,A3,S>
                     , ArgType0 , ArgType1 , void , void
                     , void , void , void , void
                     >::type
    DstViewType ;

  return DstViewType( src, arg0, arg1 );
}

/** \brief  Subview from a single specifier; trailing type slots are 'void'. */
template< class D , class A1 , class A2 , class A3 , class S ,
          class ArgType0 >
KOKKOS_INLINE_FUNCTION
typename Impl::ViewSubview< View<D,A1,A2,A3,S>
                          , ArgType0 , void , void , void
                          , void , void , void , void
                          >::type
subview( const View<D,A1,A2,A3,S> & src ,
         const ArgType0 & arg0 )
{
  typedef typename
    Impl::ViewSubview< View<D,A1,A2,A3,S>
                     , ArgType0 , void , void , void
                     , void , void , void , void
                     >::type
    DstViewType ;

  return DstViewType( src, arg0 );
}

} // namespace Kokkos
+//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include <impl/Kokkos_ViewDefault.hpp> +#include <impl/Kokkos_Atomic_View.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#else + +#include <impl/Kokkos_ViewOffset.hpp> +#include <impl/Kokkos_ViewSupport.hpp> + +#endif /* #if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */ + +#include <KokkosExp_View.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif + diff --git a/lib/kokkos/core/src/Kokkos_hwloc.hpp b/lib/kokkos/core/src/Kokkos_hwloc.hpp new file mode 100755 index 0000000000000000000000000000000000000000..a0b007f64274e5177e34568c02caf75368087045 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_hwloc.hpp @@ -0,0 +1,140 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HWLOC_HPP +#define KOKKOS_HWLOC_HPP + +#include <utility> + +namespace Kokkos { + +/** \brief Minimal subset of logical 'hwloc' functionality available + * from http://www.open-mpi.org/projects/hwloc/. + * + * The calls are NOT thread safe in order to avoid mutexes, + * memory allocations, or other actions which could give the + * runtime system an opportunity to migrate the threads or + * touch allocated memory during the function calls. + * + * All calls to these functions should be performed by a thread + * when it has guaranteed exclusive access; e.g., for OpenMP + * within a 'critical' region. + */ +namespace hwloc { + +/** \brief Query if hwloc is available */ +bool available(); + +/** \brief Query number of available NUMA regions. 
+ * This will be less than the hardware capacity + * if the MPI process is pinned to a NUMA region. + */ +unsigned get_available_numa_count(); + +/** \brief Query number of available cores per NUMA regions. + * This will be less than the hardware capacity + * if the MPI process is pinned to a set of cores. + */ +unsigned get_available_cores_per_numa(); + +/** \brief Query number of available "hard" threads per core; i.e., hyperthreads */ +unsigned get_available_threads_per_core(); + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Internal functions for binding persistent spawned threads. + +namespace Kokkos { +namespace hwloc { + +/** \brief Recommend mapping of threads onto cores. + * + * If thread_count == 0 then choose and set a value. + * If use_numa_count == 0 then choose and set a value. + * If use_cores_per_numa == 0 then choose and set a value. + * + * Return 0 if asynchronous, + * Return 1 if synchronous and threads_coord[0] is process core + */ +unsigned thread_mapping( const char * const label , + const bool allow_async , + unsigned & thread_count , + unsigned & use_numa_count , + unsigned & use_cores_per_numa , + std::pair<unsigned,unsigned> threads_coord[] ); + +/** \brief Query core-coordinate of the current thread + * with respect to the core_topology. + * + * As long as the thread is running within the + * process binding the following condition holds. + * + * core_coordinate.first < core_topology.first + * core_coordinate.second < core_topology.second + */ +std::pair<unsigned,unsigned> get_this_thread_coordinate(); + +/** \brief Bind the current thread to a core. */ +bool bind_this_thread( const std::pair<unsigned,unsigned> ); + +/** \brief Bind the current thread to one of the cores in the list. + * Set that entry to (~0,~0) and return the index. + * If binding fails return ~0. 
# Build Kokkos as a standalone static library (libkokkos.a) and install it
# together with its headers and a generated Makefile.kokkos that application
# Makefiles can include.
#
# Usage:
#   make                        # build the library in place
#   make install PREFIX=/opt/kokkos
#
# All configuration (KOKKOS_DEVICES, KOKKOS_ARCH, ...) is provided by
# $(KOKKOS_PATH)/Makefile.kokkos, included below.

KOKKOS_PATH = ../..

# Installation root; override on the command line: make install PREFIX=...
PREFIX ?= /usr/local/lib/kokkos

# None of these targets name real files; declare them phony so stray files
# with the same names cannot mask them.
.PHONY: default messages build-makefile-kokkos build-lib mkdir \
        copy-cuda copy-threads copy-openmp install clean

default: messages build-lib
	echo "End Build"


include $(KOKKOS_PATH)/Makefile.kokkos

# CUDA builds must compile and link through nvcc_wrapper (forced, not ?=,
# so an inherited CXX cannot bypass it).
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  CXX = nvcc_wrapper
  CXXFLAGS ?= -O3
  LINK = nvcc_wrapper
  LINKFLAGS ?=
else
  CXX ?= g++
  CXXFLAGS ?= -O3
  LINK ?= g++
  LINKFLAGS ?=
endif

# ':=' so the shell command runs once at parse time, not on every expansion.
PWD := $(shell pwd)

# Public headers to install; ':=' expands each wildcard exactly once.
KOKKOS_HEADERS_INCLUDE      := $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
KOKKOS_HEADERS_INCLUDE_IMPL := $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
KOKKOS_HEADERS_INCLUDE      += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
KOKKOS_HEADERS_INCLUDE_IMPL += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
KOKKOS_HEADERS_INCLUDE      += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)

# Backend-specific header copies, accumulated per enabled device.
CONDITIONAL_COPIES :=

ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
  CONDITIONAL_COPIES += copy-cuda
endif

ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
  KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
  CONDITIONAL_COPIES += copy-threads
endif

ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
  KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
  CONDITIONAL_COPIES += copy-openmp
endif

messages:
	echo "Start Build"

# Generate the Makefile.kokkos that gets installed: first snapshot the
# current configuration, then rewrite source-tree paths to installed paths.
build-makefile-kokkos:
	rm -f Makefile.kokkos
	echo "#Global Settings used to generate this library" >> Makefile.kokkos
	echo "KOKKOS_PATH = $(PREFIX)" >> Makefile.kokkos
	echo "KOKKOS_DEVICES = $(KOKKOS_DEVICES)" >> Makefile.kokkos
	echo "KOKKOS_ARCH = $(KOKKOS_ARCH)" >> Makefile.kokkos
	echo "KOKKOS_DEBUG = $(KOKKOS_DEBUG)" >> Makefile.kokkos
	echo "KOKKOS_USE_TPLS = $(KOKKOS_USE_TPLS)" >> Makefile.kokkos
	echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
	echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
	echo "CXX ?= $(CXX)" >> Makefile.kokkos
	echo "" >> Makefile.kokkos
	echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
	echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
	echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
	echo "" >> Makefile.kokkos
	echo "#Variables used in application Makefiles" >> Makefile.kokkos
	echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
	echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
	echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos
	echo "KOKKOS_LINK_DEPENDS = $(KOKKOS_LINK_DEPENDS)" >> Makefile.kokkos
	echo "KOKKOS_LIBS = $(KOKKOS_LIBS)" >> Makefile.kokkos
	echo "KOKKOS_LDFLAGS = $(KOKKOS_LDFLAGS)" >> Makefile.kokkos
	sed \
	  -e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
	  -e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
	  -e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
	  -e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
	  -e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
	  -e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' Makefile.kokkos \
	  > Makefile.kokkos.tmp
	mv -f Makefile.kokkos.tmp Makefile.kokkos

# The library itself is built via the KOKKOS_LINK_DEPENDS rules supplied by
# the included Makefile.kokkos.
build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)

# Create the installation directory layout.
mkdir:
	mkdir -p $(PREFIX)
	mkdir -p $(PREFIX)/include
	mkdir -p $(PREFIX)/lib
	mkdir -p $(PREFIX)/include/impl

copy-cuda: mkdir
	mkdir -p $(PREFIX)/include/Cuda
	cp $(KOKKOS_HEADERS_CUDA) $(PREFIX)/include/Cuda

copy-threads: mkdir
	mkdir -p $(PREFIX)/include/Threads
	cp $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads

copy-openmp: mkdir
	mkdir -p $(PREFIX)/include/OpenMP
	cp $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP

install: mkdir $(CONDITIONAL_COPIES) build-lib
	cp $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
	cp $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
	cp Makefile.kokkos $(PREFIX)
	cp libkokkos.a $(PREFIX)/lib
	cp KokkosCore_config.h $(PREFIX)/include

# kokkos-clean is presumably provided by the included Makefile.kokkos —
# TODO confirm.  '-f' so a repeated clean does not fail on a missing file.
clean: kokkos-clean
	rm -f Makefile.kokkos
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_PARALLEL_HPP +#define KOKKOS_OPENMP_PARALLEL_HPP + +#include <omp.h> + +#include <Kokkos_Parallel.hpp> +#include <OpenMP/Kokkos_OpenMPexec.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ; + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value , + const FunctorType & >::type functor + , const PType & range ) + { + const typename PType::member_type work_end = range.end(); + for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; 
++iwork ) { + functor( iwork ); + } + } + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value , + const FunctorType & >::type functor + , const PType & range ) + { + const typename PType::member_type work_end = range.end(); + for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) { + functor( typename PType::work_tag() , iwork ); + } + } + +public: + + inline + ParallelFor( const FunctorType & functor + , const Policy & policy ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for"); + OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for"); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + driver( functor , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() ) ); + } +/* END #pragma omp parallel */ + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ; + typedef typename Policy::work_tag WorkTag ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag 
, void >::value , + const FunctorType & >::type functor + , reference_type update + , const PType & range ) + { + const typename PType::member_type work_end = range.end(); + for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) { + functor( iwork , update ); + } + } + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value , + const FunctorType & >::type functor + , reference_type update + , const PType & range ) + { + const typename PType::member_type work_end = range.end(); + for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) { + functor( typename PType::work_tag() , iwork , update ); + } + } + +public: + + //---------------------------------------- + + template< class ViewType > + inline + ParallelReduce( typename Impl::enable_if< + ( Impl::is_view< ViewType >::value && + Impl::is_same< typename ViewType::memory_space , HostSpace >::value + ), const FunctorType & >::type functor + , const Policy & policy + , const ViewType & result_view ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); + OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce"); + + OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , 0 ); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + driver( functor + , ValueInit::init( functor , exec.scratch_reduce() ) + , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() ) + ); + } +/* END #pragma omp parallel */ + + { + const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() ); + + for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) { + ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() ); + } + + Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr ); + + if ( result_view.ptr_on_device() ) { + const int n = 
ValueTraits::value_count( functor ); + + for ( int j = 0 ; j < n ; ++j ) { result_view.ptr_on_device()[j] = ptr[j] ; } + } + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ; + typedef typename Policy::work_tag WorkTag ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ; + typedef Kokkos::Impl::FunctorValueOps< FunctorType , WorkTag > ValueOps ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value , + const FunctorType & >::type functor + , reference_type update + , const PType & range + , const bool final ) + { + const typename PType::member_type work_end = range.end(); + for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) { + functor( iwork , update , final ); + } + } + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< ! 
Impl::is_same< typename PType::work_tag , void >::value , + const FunctorType & >::type functor + , reference_type update + , const PType & range + , const bool final ) + { + const typename PType::member_type work_end = range.end(); + for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) { + functor( typename PType::work_tag() , iwork , update , final ); + } + } + +public: + + //---------------------------------------- + + inline + ParallelScan( const FunctorType & functor + , const Policy & policy ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan"); + OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan"); + + OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( functor ) , 0 ); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + driver( functor + , ValueInit::init( functor , pointer_type( exec.scratch_reduce() ) + ValueTraits::value_count( functor ) ) + , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() ) + , false ); + } +/* END #pragma omp parallel */ + + { + const unsigned thread_count = OpenMPexec::pool_size(); + const unsigned value_count = ValueTraits::value_count( functor ); + + pointer_type ptr_prev = 0 ; + + for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) { + + pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() ); + + if ( ptr_prev ) { + for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; } + ValueJoin::join( functor , ptr + value_count , ptr ); + } + else { + ValueInit::init( functor , ptr ); + } + + ptr_prev = ptr ; + } + } + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + driver( functor + , ValueOps::reference( pointer_type( exec.scratch_reduce() ) ) + , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() ) + , true ); + } +/* END #pragma omp parallel */ + + } + + 
//---------------------------------------- +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Arg0 , class Arg1 > +class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > > +{ +private: + + typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ; + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value , + const FunctorType & >::type functor + , const typename Policy::member_type & member ) + { functor( member ); } + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value , + const FunctorType & >::type functor + , const typename Policy::member_type & member ) + { functor( TagType() , member ); } + +public: + + inline + ParallelFor( const FunctorType & functor , + const Policy & policy ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for"); + OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for"); + + const size_t team_reduce_size = Policy::member_type::team_reduce_size(); + const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ); + + OpenMPexec::resize_scratch( 0 , team_reduce_size + team_shmem_size ); + +#pragma omp parallel + { + typename Policy::member_type member( * OpenMPexec::get_thread_omp() , policy , team_shmem_size ); + + for ( ; member.valid() ; member.next() ) { + ParallelFor::template driver< typename Policy::work_tag >( functor , member ); + } + } +/* END #pragma omp parallel */ + } + + void wait() {} +}; + + +template< class FunctorType , class Arg0 , class Arg1 > +class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , 
Arg1 , Kokkos::OpenMP > > +{ +private: + + typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ; + typedef typename Policy::work_tag WorkTag ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value , + const FunctorType & >::type functor + , const typename PType::member_type & member + , reference_type update ) + { functor( member , update ); } + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value , + const FunctorType & >::type functor + , const typename PType::member_type & member + , reference_type update ) + { functor( typename PType::work_tag() , member , update ); } + +public: + + inline + ParallelReduce( const FunctorType & functor , + const Policy & policy ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); + + const size_t team_reduce_size = Policy::member_type::team_reduce_size(); + const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ); + + OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size ); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + reference_type update = ValueInit::init( functor , exec.scratch_reduce() ); + + for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) { + ParallelReduce::template driver< Policy >( functor , member , update ); + } + } 
+/* END #pragma omp parallel */ + + { + typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag , reference_type > Join ; + + const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() ); + + for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) { + Join::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() ); + } + + Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr ); + } + } + + template< class ViewType > + inline + ParallelReduce( const FunctorType & functor , + const Policy & policy , + const ViewType & result ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); + + const size_t team_reduce_size = Policy::member_type::team_reduce_size(); + const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ); + + OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size ); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + reference_type update = ValueInit::init( functor , exec.scratch_reduce() ); + + for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) { + ParallelReduce::template driver< Policy >( functor , member , update ); + } + } +/* END #pragma omp parallel */ + + { + const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() ); + + for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) { + ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() ); + } + + Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr ); + + const int n = ValueTraits::value_count( functor ); + + for ( int j = 0 ; j < n ; ++j ) { result.ptr_on_device()[j] = ptr[j] ; } + } + } + + void wait() {} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------- + +#endif /* KOKKOS_OPENMP_PARALLEL_HPP */ + diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp new file mode 100755 index 0000000000000000000000000000000000000000..ed98fd2f979af77a70bd4d6b0a44a570be65c40c --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp @@ -0,0 +1,364 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdio.h> +#include <limits> +#include <iostream> +#include <vector> +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Error.hpp> +#include <iostream> + +#ifdef KOKKOS_HAVE_OPENMP + +namespace Kokkos { +namespace Impl { +namespace { + +KOKKOS_INLINE_FUNCTION +int kokkos_omp_in_parallel(); + +int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 ); + +KOKKOS_INLINE_FUNCTION +int kokkos_omp_in_parallel() +{ +#ifndef __CUDA_ARCH__ + return omp_in_parallel() && ! 
kokkos_omp_in_critical_region ; +#else + return 0; +#endif +} + +bool s_using_hwloc = false; + +} // namespace +} // namespace Impl +} // namespace Kokkos + + +namespace Kokkos { +namespace Impl { + +int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 }; + +int OpenMPexec::m_pool_topo[ 4 ] = { 0 }; + +OpenMPexec::Pool OpenMPexec::m_pool; + +void OpenMPexec::verify_is_process( const char * const label ) +{ + if ( omp_in_parallel() ) { + std::string msg( label ); + msg.append( " ERROR: in parallel" ); + Kokkos::Impl::throw_runtime_exception( msg ); + } +} + +void OpenMPexec::verify_initialized( const char * const label ) +{ + if ( 0 == m_pool[0] ) { + std::string msg( label ); + msg.append( " ERROR: not initialized" ); + Kokkos::Impl::throw_runtime_exception( msg ); + } +} + +void OpenMPexec::clear_scratch() +{ +#pragma omp parallel + { + const int rank_rev = m_map_rank[ omp_get_thread_num() ]; + m_pool.at(rank_rev).clear(); + } +/* END #pragma omp parallel */ +} + +void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size ) +{ + enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 }; + enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK }; + + const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end : 0 ; + const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ; + + reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ; + thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ; + + // Requesting allocation and old allocation is too small: + + const bool allocate = ( old_reduce_size < reduce_size ) || + ( old_thread_size < thread_size ); + + if ( allocate ) { + if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; } + if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; } + } + + const size_t alloc_size = allocate ? 
ALLOC_EXEC + reduce_size + thread_size : 0 ; + const int pool_size = m_pool_topo[0] ; + + if ( allocate ) { + + clear_scratch(); + +#pragma omp parallel + { + const int rank_rev = m_map_rank[ omp_get_thread_num() ]; + const int rank = pool_size - ( rank_rev + 1 ); + + m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size ); + new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size ); + } +/* END #pragma omp parallel */ + } +} + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +int OpenMP::is_initialized() +{ return 0 != Impl::OpenMPexec::m_pool[0]; } + +void OpenMP::initialize( unsigned thread_count , + unsigned use_numa_count , + unsigned use_cores_per_numa ) +{ + // Before any other call to OMP query the maximum number of threads + // and save the value for re-initialization unit testing. + + //Using omp_get_max_threads(); is problematic in conjunction with + //Hwloc on Intel (essentially an initial call to the OpenMP runtime + //without a parallel region before will set a process mask for a single core + //The runtime will than bind threads for a parallel region to other cores on the + //entering the first parallel region and make the process mask the aggregate of + //the thread masks. The intend seems to be to make serial code run fast, if you + //compile with OpenMP enabled but don't actually use parallel regions or so + //static int omp_max_threads = omp_get_max_threads(); + int nthreads = 0; + #pragma omp parallel + { + #pragma omp atomic + nthreads++; + } + + static int omp_max_threads = nthreads; + + const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ; + + bool thread_spawn_failed = false ; + + if ( ! 
is_initialized ) { + + // Use hwloc thread pinning if concerned with locality. + // If spreading threads across multiple NUMA regions. + // If hyperthreading is enabled. + Impl::s_using_hwloc = hwloc::available() && ( + ( 1 < Kokkos::hwloc::get_available_numa_count() ) || + ( 1 < Kokkos::hwloc::get_available_threads_per_core() ) ); + + std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ]; + + // If hwloc available then use it's maximum value. + + if ( thread_count == 0 ) { + thread_count = Impl::s_using_hwloc + ? Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core() + : omp_max_threads ; + } + + if(Impl::s_using_hwloc) + hwloc::thread_mapping( "Kokkos::OpenMP::initialize" , + false /* do not allow asynchronous */ , + thread_count , + use_numa_count , + use_cores_per_numa , + threads_coord ); + + // Spawn threads: + + omp_set_num_threads( thread_count ); + + // Verify OMP interaction: + if ( int(thread_count) != omp_get_max_threads() ) { + thread_spawn_failed = true ; + } + + // Verify spawning and bind threads: +#pragma omp parallel + { +#pragma omp critical + { + if ( int(thread_count) != omp_get_num_threads() ) { + thread_spawn_failed = true ; + } + + // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region. + // Call to 'new' may not be thread safe as well. + + // Reverse the rank for threads so that the scan operation reduces to the highest rank thread. + + const unsigned omp_rank = omp_get_thread_num(); + const unsigned thread_r = Impl::s_using_hwloc ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) : omp_rank ; + + Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ; + } +/* END #pragma omp critical */ + } +/* END #pragma omp parallel */ + + if ( ! thread_spawn_failed ) { + Impl::OpenMPexec::m_pool_topo[0] = thread_count ; + Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? 
thread_count / use_numa_count : thread_count; + Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1; + + Impl::OpenMPexec::resize_scratch( 1024 , 1024 ); + } + } + + if ( is_initialized || thread_spawn_failed ) { + std::string msg("Kokkos::OpenMP::initialize ERROR"); + + if ( is_initialized ) { msg.append(" : already initialized"); } + if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); } + + Kokkos::Impl::throw_runtime_exception(msg); + } + + // Init the array for used for arbitrarily sized atomics + Impl::init_lock_array_host_space(); +} + +//---------------------------------------------------------------------------- + +void OpenMP::finalize() +{ + Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" ); + Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" ); + + Impl::OpenMPexec::clear_scratch(); + + Impl::OpenMPexec::m_pool_topo[0] = 0 ; + Impl::OpenMPexec::m_pool_topo[1] = 0 ; + Impl::OpenMPexec::m_pool_topo[2] = 0 ; + + omp_set_num_threads(1); + + if ( Impl::s_using_hwloc ) { + hwloc::unbind_this_thread(); + } +} + +//---------------------------------------------------------------------------- + +void OpenMP::print_configuration( std::ostream & s , const bool detail ) +{ + Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" ); + + s << "Kokkos::OpenMP" ; + +#if defined( KOKKOS_HAVE_OPENMP ) + s << " KOKKOS_HAVE_OPENMP" ; +#endif +#if defined( KOKKOS_HAVE_HWLOC ) + + const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); + + s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]" + << " hwloc_binding_" << ( Impl::s_using_hwloc ? 
"enabled" : "disabled" ) + ; +#endif + + const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ; + + if ( is_initialized ) { + const int numa_count = Kokkos::Impl::OpenMPexec::m_pool_topo[0] / Kokkos::Impl::OpenMPexec::m_pool_topo[1] ; + const int core_per_numa = Kokkos::Impl::OpenMPexec::m_pool_topo[1] / Kokkos::Impl::OpenMPexec::m_pool_topo[2] ; + const int thread_per_core = Kokkos::Impl::OpenMPexec::m_pool_topo[2] ; + + s << " thread_pool_topology[ " << numa_count + << " x " << core_per_numa + << " x " << thread_per_core + << " ]" + << std::endl ; + + if ( detail ) { + std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPexec::m_pool_topo[0] ); + +#pragma omp parallel + { +#pragma omp critical + { + coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate(); + } +/* END #pragma omp critical */ + } +/* END #pragma omp parallel */ + + for ( unsigned i = 0 ; i < coord.size() ; ++i ) { + s << " thread omp_rank[" << i << "]" + << " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ i ] << "]" + << " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]" + << std::endl ; + } + } + } + else { + s << " not initialized" << std::endl ; + } +} + +} // namespace Kokkos + +#endif //KOKKOS_HAVE_OPENMP diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp new file mode 100755 index 0000000000000000000000000000000000000000..1ab08f648d42a01f81dfdc3d890d5d06fa974f29 --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp @@ -0,0 +1,767 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPEXEC_HPP +#define KOKKOS_OPENMPEXEC_HPP + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_spinwait.hpp> +#include <impl/Kokkos_AllocationTracker.hpp> + +#include <Kokkos_Atomic.hpp> + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +/** \brief Data for OpenMP thread execution */ + +class OpenMPexec { +public: + + enum { MAX_THREAD_COUNT = 4096 }; + + struct Pool + { + Pool() : m_trackers() {} + + AllocationTracker m_trackers[ MAX_THREAD_COUNT ]; + + OpenMPexec * operator[](int i) + { + return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr()); + } + + AllocationTracker & at(int i) + { + return m_trackers[i]; + } + }; + +private: + + static int m_pool_topo[ 4 ]; + static int m_map_rank[ MAX_THREAD_COUNT ]; + static Pool m_pool; // Indexed by: m_pool_rank_rev + + friend class Kokkos::OpenMP ; + + int const m_pool_rank ; + int const m_pool_rank_rev ; + int const m_scratch_exec_end ; + int const m_scratch_reduce_end ; + int const m_scratch_thread_end ; + + int volatile m_barrier_state ; + + OpenMPexec(); + OpenMPexec( const OpenMPexec & ); + OpenMPexec & operator = ( const OpenMPexec & ); + + static void clear_scratch(); + +public: + + // Topology of a cache coherent thread pool: + // TOTAL = NUMA x GRAIN + // pool_size( depth = 0 ) + // pool_size(0) = total number of threads + // pool_size(1) = number of threads per NUMA + // pool_size(2) = number of threads sharing finest grain memory hierarchy + + inline static + int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; } + + inline static + OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; } + + inline int pool_rank() const { return m_pool_rank ; } + inline int pool_rank_rev() const { return m_pool_rank_rev ; } + + inline void * 
scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; } + inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; } + + inline + void state_wait( int state ) + { Impl::spinwait( m_barrier_state , state ); } + + inline + void state_set( int state ) { m_barrier_state = state ; } + + ~OpenMPexec() {} + + OpenMPexec( const int poolRank + , const int scratch_exec_size + , const int scratch_reduce_size + , const int scratch_thread_size ) + : m_pool_rank( poolRank ) + , m_pool_rank_rev( pool_size() - ( poolRank + 1 ) ) + , m_scratch_exec_end( scratch_exec_size ) + , m_scratch_reduce_end( m_scratch_exec_end + scratch_reduce_size ) + , m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size ) + , m_barrier_state(0) + {} + + static void finalize(); + + static void initialize( const unsigned team_count , + const unsigned threads_per_team , + const unsigned numa_count , + const unsigned cores_per_numa ); + + static void verify_is_process( const char * const ); + static void verify_initialized( const char * const ); + + static void resize_scratch( size_t reduce_size , size_t thread_size ); + + inline static + OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +class OpenMPexecTeamMember { +private: + + enum { TEAM_REDUCE_SIZE = 512 }; + + /** \brief Thread states for team synchronization */ + enum { Active = 0 , Rendezvous = 1 }; + + typedef Kokkos::OpenMP execution_space ; + typedef execution_space::scratch_memory_space scratch_memory_space ; + + Impl::OpenMPexec & m_exec ; + scratch_memory_space m_team_shared ; + int m_team_shmem ; + int m_team_base_rev ; + int m_team_rank_rev ; + int m_team_rank ; + int m_team_size ; + int 
m_league_rank ; + int m_league_end ; + int m_league_size ; + + // Fan-in team threads, root of the fan-in which does not block returns true + inline + bool team_fan_in() const + { + for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) { + m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active ); + } + + if ( m_team_rank_rev ) { + m_exec.state_set( Rendezvous ); + m_exec.state_wait( Rendezvous ); + } + + return 0 == m_team_rank_rev ; + } + + inline + void team_fan_out() const + { + for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) { + m_exec.pool_rev( m_team_base_rev + j )->state_set( Active ); + } + } + +public: + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space & team_shmem() const + { return m_team_shared ; } + + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; } + KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; } + KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; } + KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; } + + KOKKOS_INLINE_FUNCTION void team_barrier() const +#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + {} +#else + { + if ( 1 < m_team_size ) { + team_fan_in(); + team_fan_out(); + } + } +#endif + + template<class ValueType> + KOKKOS_INLINE_FUNCTION + void team_broadcast(ValueType& value, const int& thread_id) const + { +#if ! 
defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { } +#else + // Make sure there is enough scratch space: + typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE + , ValueType , void >::type type ; + + type * const local_value = ((type*) m_exec.scratch_thread()); + if(team_rank() == thread_id) + *local_value = value; + memory_fence(); + team_barrier(); + value = *local_value; +#endif + } + +#ifdef KOKKOS_HAVE_CXX11 + template< class ValueType, class JoinOp > + KOKKOS_INLINE_FUNCTION ValueType + team_reduce( const ValueType & value + , const JoinOp & op_in ) const + #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return ValueType(); } + #else + { + typedef ValueType value_type; + const JoinLambdaAdapter<value_type,JoinOp> op(op_in); + #endif +#else // KOKKOS_HAVE_CXX11 + template< class JoinOp > + KOKKOS_INLINE_FUNCTION typename JoinOp::value_type + team_reduce( const typename JoinOp::value_type & value + , const JoinOp & op ) const + #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return typename JoinOp::value_type(); } + #else + { + typedef typename JoinOp::value_type value_type; + #endif +#endif // KOKKOS_HAVE_CXX11 +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + // Make sure there is enough scratch space: + typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE + , value_type , void >::type type ; + + type * const local_value = ((type*) m_exec.scratch_thread()); + + // Set this thread's contribution + *local_value = value ; + + // Fence to make sure the base team member has access: + memory_fence(); + + if ( team_fan_in() ) { + // The last thread to synchronize returns true, all other threads wait for team_fan_out() + type * const team_value = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread()); + + // Join to the team value: + for ( int i = 1 ; i < m_team_size ; ++i ) { + op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) ); + } + + // The base team 
member may "lap" the other team members, + // copy to their local value before proceeding. + for ( int i = 1 ; i < m_team_size ; ++i ) { + *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ; + } + + // Fence to make sure all team members have access + memory_fence(); + } + + team_fan_out(); + + return *((type volatile const *)local_value); + } +#endif + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template< typename ArgType > + KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const +#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return ArgType(); } +#else + { + // Make sure there is enough scratch space: + typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ; + + volatile type * const work_value = ((type*) m_exec.scratch_thread()); + + *work_value = value ; + + memory_fence(); + + if ( team_fan_in() ) { + // The last thread to synchronize returns true, all other threads wait for team_fan_out() + // m_team_base[0] == highest ranking team member + // m_team_base[ m_team_size - 1 ] == lowest ranking team member + // + // 1) copy from lower to higher rank, initialize lowest rank to zero + // 2) prefix sum from lowest to highest rank, skipping lowest rank + + type accum = 0 ; + + if ( global_accum ) { + for ( int i = m_team_size ; i-- ; ) { + type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()); + accum += val ; + } + accum = atomic_fetch_add( global_accum , accum ); + } + + for ( int i = m_team_size ; i-- ; ) { + type & val = 
*((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()); + const type offset = accum ; + accum += val ; + val = offset ; + } + + memory_fence(); + } + + team_fan_out(); + + return *work_value ; + } +#endif + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. + * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const + { return this-> template team_scan<Type>( value , 0 ); } + + //---------------------------------------- + // Private for the driver + +private: + + typedef execution_space::scratch_memory_space space ; + +public: + + template< class Arg0 , class Arg1 > + inline + OpenMPexecTeamMember( Impl::OpenMPexec & exec + , const TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > & team + , const int shmem_size + ) + : m_exec( exec ) + , m_team_shared(0,0) + , m_team_shmem( shmem_size ) + , m_team_base_rev(0) + , m_team_rank_rev(0) + , m_team_rank(0) + , m_team_size( team.team_size() ) + , m_league_rank(0) + , m_league_end(0) + , m_league_size( team.league_size() ) + { + const int pool_rank_rev = m_exec.pool_rank_rev(); + const int pool_team_rank_rev = pool_rank_rev % team.team_alloc(); + const int pool_league_rank_rev = pool_rank_rev / team.team_alloc(); + const int league_iter_end = team.league_size() - pool_league_rank_rev * team.team_iter(); + + if ( pool_team_rank_rev < m_team_size && 0 < league_iter_end ) { + m_team_base_rev = team.team_alloc() * pool_league_rank_rev ; + m_team_rank_rev = pool_team_rank_rev ; + m_team_rank = m_team_size - ( m_team_rank_rev + 1 ); + m_league_end = league_iter_end ; + m_league_rank = league_iter_end > team.team_iter() ? 
league_iter_end - team.team_iter() : 0 ; + new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem ); + } + } + + bool valid() const + { return m_league_rank < m_league_end ; } + + void next() + { + if ( ++m_league_rank < m_league_end ) { + team_barrier(); + new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem ); + } + } + + static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; } +}; + + + +} // namespace Impl + +template< class Arg0 , class Arg1 > +class TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > +{ +public: + + //! Tag this class as a kokkos execution policy + typedef TeamPolicy execution_policy ; + + //! Execution space of this execution policy. + typedef Kokkos::OpenMP execution_space ; + + typedef typename + Impl::if_c< ! Impl::is_same< Kokkos::OpenMP , Arg0 >::value , Arg0 , Arg1 >::type + work_tag ; + + //---------------------------------------- + + template< class FunctorType > + inline static + int team_size_max( const FunctorType & ) + { return execution_space::thread_pool_size(1); } + + template< class FunctorType > + inline static + int team_size_recommended( const FunctorType & ) + { return execution_space::thread_pool_size(2); } + + template< class FunctorType > + inline static + int team_size_recommended( const FunctorType &, const int& ) + { return execution_space::thread_pool_size(2); } + + //---------------------------------------- + +private: + + int m_league_size ; + int m_team_size ; + int m_team_alloc ; + int m_team_iter ; + + inline void init( const int league_size_request + , const int team_size_request ) + { + const int pool_size = execution_space::thread_pool_size(0); + const int team_max = execution_space::thread_pool_size(1); + const int team_grain = execution_space::thread_pool_size(2); + + m_league_size = league_size_request ; + + m_team_size = team_size_request < 
team_max ? + team_size_request : team_max ; + + // Round team size up to a multiple of 'team_gain' + const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain ); + const int team_count = pool_size / team_size_grain ; + + // Constraint : pool_size = m_team_alloc * team_count + m_team_alloc = pool_size / team_count ; + + // Maxumum number of iterations each team will take: + m_team_iter = ( m_league_size + team_count - 1 ) / team_count ; + } + +public: + + inline int team_size() const { return m_team_size ; } + inline int league_size() const { return m_league_size ; } + + /** \brief Specify league size, request team size */ + TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1) + { init( league_size_request , team_size_request ); (void) vector_length_request; } + + TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 ) + { init( league_size_request , team_size_request ); (void) vector_length_request; } + + inline int team_alloc() const { return m_team_alloc ; } + inline int team_iter() const { return m_team_iter ; } + + typedef Impl::OpenMPexecTeamMember member_type ; +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +inline +int OpenMP::thread_pool_size( int depth ) +{ + return Impl::OpenMPexec::pool_size(depth); +} + +KOKKOS_INLINE_FUNCTION +int OpenMP::thread_pool_rank() +{ +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Impl::OpenMPexec::m_map_rank[ omp_get_thread_num() ]; +#else + return -1 ; +#endif +} + +} // namespace Kokkos + + +namespace Kokkos { + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember> + TeamThreadRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) { 
+ return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>(thread,count); +} + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember> + TeamThreadRange(const Impl::OpenMPexecTeamMember& thread, const iType& begin, const iType& end) { + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>(thread,begin,end); +} + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember > + ThreadVectorRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) { + return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >(thread,count); +} + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) { + return Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) { + return Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>(thread); +} +} // namespace Kokkos + +namespace Kokkos { + + /** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team. + * This functionality requires C++11 support.*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) { + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i); +} + +/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of + * val is performed and put into result. 
This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, + const Lambda & lambda, ValueType& result) { + + result = ValueType(); + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + result+=tmp; + } + + result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>()); +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of + * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result. + * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore + * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or + * '1 for *'). This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, + const Lambda & lambda, const JoinType& join, ValueType& init_result) { + + ValueType result = init_result; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + join(result,tmp); + } + + init_result = loop_boundaries.thread.team_reduce(result,join); +} + +} //namespace Kokkos + + +namespace Kokkos { +/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread. 
+ * This functionality requires C++11 support.*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >& + loop_boundaries, const Lambda& lambda) { + #ifdef KOKKOS_HAVE_PRAGMA_IVDEP + #pragma ivdep + #endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i); +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of + * val is performed and put into result. This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >& + loop_boundaries, const Lambda & lambda, ValueType& result) { + result = ValueType(); +#ifdef KOKKOS_HAVE_PRAGMA_IVDEP +#pragma ivdep +#endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + result+=tmp; + } +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of + * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result. + * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore + * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or + * '1 for *'). 
This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >& + loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) { + + ValueType result = init_result; +#ifdef KOKKOS_HAVE_PRAGMA_IVDEP +#pragma ivdep +#endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + join(result,tmp); + } + init_result = result; +} + +/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final) + * for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed. + * Depending on the target execution space the operator might be called twice: once with final=false + * and once with final=true. When final==true val contains the prefix sum value. The contribution of this + * "i" needs to be added to val no matter whether final==true or not. In a serial execution + * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set + * to the final sum value over all vector lanes. 
+ * This functionality requires C++11 support.*/ +template< typename iType, class FunctorType > +KOKKOS_INLINE_FUNCTION +void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >& + loop_boundaries, const FunctorType & lambda) { + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef typename ValueTraits::value_type value_type ; + + value_type scan_val = value_type(); + +#ifdef KOKKOS_HAVE_PRAGMA_IVDEP +#pragma ivdep +#endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,scan_val,true); + } +} + +} // namespace Kokkos + +namespace Kokkos { + +template<class FunctorType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) { + lambda(); +} + +template<class FunctorType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) { + if(single_struct.team_member.team_rank()==0) lambda(); +} + +template<class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) { + lambda(val); +} + +template<class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) { + if(single_struct.team_member.team_rank()==0) { + lambda(val); + } + single_struct.team_member.team_broadcast(val,0); +} +} + +#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */ + diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp new file mode 100755 index 0000000000000000000000000000000000000000..d8b40943deb6264f96a787bdd661534ca1372c8d --- /dev/null +++ 
b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp @@ -0,0 +1,484 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core_fwd.hpp> + +#if defined( KOKKOS_HAVE_QTHREAD ) + +#include <stdio.h> +#include <stdlib.h> +#include <iostream> +#include <sstream> +#include <utility> +#include <Kokkos_Qthread.hpp> +#include <Kokkos_Atomic.hpp> +#include <impl/Kokkos_Error.hpp> + +// Defines to enable experimental Qthread functionality + +#define QTHREAD_LOCAL_PRIORITY +#define CLONED_TASKS + +#include <qthread/qthread.h> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +namespace { + +enum { MAXIMUM_QTHREAD_WORKERS = 1024 }; + +/** s_exec is indexed by the reverse rank of the workers + * for faster fan-in / fan-out lookups + * [ n - 1 , n - 2 , ... , 0 ] + */ +QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ]; + +int s_number_shepherds = 0 ; +int s_number_workers_per_shepherd = 0 ; +int s_number_workers = 0 ; + +inline +QthreadExec ** worker_exec() +{ + return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 ); +} + +const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) ); + +int s_worker_reduce_end = 0 ; /* End of worker reduction memory */ +int s_worker_shared_end = 0 ; /* Total of worker scratch memory */ +int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */ + +QthreadExecFunctionPointer volatile s_active_function = 0 ; +const void * volatile s_active_function_arg = 0 ; + +} /* namespace */ +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +void Qthread::initialize( int thread_count ) +{ + // Environment variable: QTHREAD_NUM_SHEPHERDS + // Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP + // Environment variable: QTHREAD_HWPAR + + { + char buffer[256]; + 
snprintf(buffer,sizeof(buffer),"QTHREAD_HWPAR=%d",thread_count); + putenv(buffer); + } + + const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) && + ( thread_count == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) && + ( thread_count == qthread_num_workers() ); + + bool ok_symmetry = true ; + + if ( ok_init ) { + Impl::s_number_shepherds = qthread_num_shepherds(); + Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD); + Impl::s_number_workers = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ; + + for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) { + ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) ); + } + } + + if ( ! ok_init || ! ok_symmetry ) { + std::ostringstream msg ; + + msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ; + msg << " : qthread_num_shepherds = " << qthread_num_shepherds(); + msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD); + msg << " : qthread_num_workers = " << qthread_num_workers(); + + if ( ! 
ok_symmetry ) { + msg << " : qthread_num_workers_local = {" ; + for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) { + msg << " " << qthread_num_workers_local(i) ; + } + msg << " }" ; + } + + Impl::s_number_workers = 0 ; + Impl::s_number_shepherds = 0 ; + Impl::s_number_workers_per_shepherd = 0 ; + + if ( ok_init ) { qthread_finalize(); } + + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + + Impl::QthreadExec::resize_worker_scratch( 256 , 256 ); + + // Init the array for used for arbitrarily sized atomics + Impl::init_lock_array_host_space(); + +} + +void Qthread::finalize() +{ + Impl::QthreadExec::clear_workers(); + + if ( Impl::s_number_workers ) { + qthread_finalize(); + } + + Impl::s_number_workers = 0 ; + Impl::s_number_shepherds = 0 ; + Impl::s_number_workers_per_shepherd = 0 ; +} + +void Qthread::print_configuration( std::ostream & s , const bool detail ) +{ + s << "Kokkos::Qthread {" + << " num_shepherds(" << Impl::s_number_shepherds << ")" + << " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")" + << " }" << std::endl ; +} + +Qthread & Qthread::instance( int ) +{ + static Qthread q ; + return q ; +} + +void Qthread::fence() +{ +} + +int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; } +int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; } + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +namespace { + +aligned_t driver_exec_all( void * arg ) +{ + QthreadExec & exec = **worker_exec(); + + (*s_active_function)( exec , s_active_function_arg ); + +/* + fprintf( stdout + , "QthreadExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n" + , exec.worker_rank() + , exec.worker_size() + , exec.shepherd_rank() + , exec.shepherd_size() + , exec.shepherd_worker_rank() + , exec.shepherd_worker_size() + ); + fflush(stdout); +*/ + + return 0 ; +} + +aligned_t 
driver_resize_worker_scratch( void * arg ) +{ + static volatile int lock_begin = 0 ; + static volatile int lock_end = 0 ; + + QthreadExec ** const exec = worker_exec(); + + //---------------------------------------- + // Serialize allocation for thread safety + + while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock + + const bool ok = 0 == *exec ; + + if ( ok ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); } + + lock_begin = 0 ; // release lock + + if ( ok ) { new( *exec ) QthreadExec(); } + + //---------------------------------------- + // Wait for all calls to complete to insure that each worker has executed. + + if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; } + + while ( lock_end ); + +/* + fprintf( stdout + , "QthreadExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n" + , (**exec).worker_rank() + , (**exec).worker_size() + , (**exec).shepherd_rank() + , (**exec).shepherd_size() + , (**exec).shepherd_worker_rank() + , (**exec).shepherd_worker_size() + ); + fflush(stdout); +*/ + + //---------------------------------------- + + if ( ! 
ok ) { + fprintf( stderr , "Kokkos::QthreadExec resize failed\n" ); + fflush( stderr ); + } + + return 0 ; +} + +void verify_is_process( const char * const label , bool not_active = false ) +{ + const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local(NULL); + const bool is_active = not_active && ( s_active_function || s_active_function_arg ); + + if ( not_process || is_active ) { + std::string msg( label ); + msg.append( " : FAILED" ); + if ( not_process ) msg.append(" : not called by main process"); + if ( is_active ) msg.append(" : parallel execution in progress"); + Kokkos::Impl::throw_runtime_exception( msg ); + } +} + +} + +int QthreadExec::worker_per_shepherd() +{ + return s_number_workers_per_shepherd ; +} + +QthreadExec::QthreadExec() +{ + const int shepherd_rank = qthread_shep(); + const int shepherd_worker_rank = qthread_worker_local(NULL); + const int worker_rank = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ; + + m_worker_base = s_exec ; + m_shepherd_base = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) ); + m_scratch_alloc = ( (unsigned char *) this ) + s_base_size ; + m_reduce_end = s_worker_reduce_end ; + m_shepherd_rank = shepherd_rank ; + m_shepherd_size = s_number_shepherds ; + m_shepherd_worker_rank = shepherd_worker_rank ; + m_shepherd_worker_size = s_number_workers_per_shepherd ; + m_worker_rank = worker_rank ; + m_worker_size = s_number_workers ; + m_worker_state = QthreadExec::Active ; +} + +void QthreadExec::clear_workers() +{ + for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) { + QthreadExec * const exec = s_exec[iwork] ; + s_exec[iwork] = 0 ; + free( exec ); + } +} + +void QthreadExec::shared_reset( Qthread::scratch_memory_space & space ) +{ + new( & space ) + Qthread::scratch_memory_space( + ((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin , + s_worker_shared_end - s_worker_shared_begin + ); +} + +void 
QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size ) +{ + const int exec_all_reduce_alloc = align_alloc( reduce_size ); + const int shepherd_scan_alloc = align_alloc( 8 ); + const int shepherd_shared_end = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size ); + + if ( s_worker_reduce_end < exec_all_reduce_alloc || + s_worker_shared_end < shepherd_shared_end ) { + +/* + fprintf( stdout , "QthreadExec::resize\n"); + fflush(stdout); +*/ + + // Clear current worker memory before allocating new worker memory + clear_workers(); + + // Increase the buffers to an aligned allocation + s_worker_reduce_end = exec_all_reduce_alloc ; + s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ; + s_worker_shared_end = shepherd_shared_end ; + + // Need to query which shepherd this main 'process' is running... + + const int main_shep = qthread_shep(); + + // Have each worker resize its memory for proper first-touch +#if 0 + for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) { + for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) { + qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep ); + }} +#else + // If this function is used before the 'qthread.task_policy' unit test + // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so. + for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) { + const int num_clone = jshep != main_shep ? 
s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ; + + if ( num_clone ) { + const int ret = qthread_fork_clones_to_local_priority + ( driver_resize_worker_scratch /* function */ + , NULL /* function data block */ + , NULL /* pointer to return value feb */ + , jshep /* shepherd number */ + , num_clone - 1 /* number of instances - 1 */ + ); + + assert(ret == QTHREAD_SUCCESS); + } + } +#endif + + driver_resize_worker_scratch( NULL ); + + // Verify all workers allocated + + bool ok = true ; + for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; } + + if ( ! ok ) { + std::ostringstream msg ; + msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ; + for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) { + if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); } + } + msg << " }" ; + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + } +} + +void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg ) +{ + verify_is_process("QthreadExec::exec_all(...)",true); + +/* + fprintf( stdout , "QthreadExec::exec_all\n"); + fflush(stdout); +*/ + + s_active_function = func ; + s_active_function_arg = arg ; + + // Need to query which shepherd this main 'process' is running... + + const int main_shep = qthread_shep(); + +#if 0 + for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) { + for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) { + qthread_fork_to( driver_exec_all , NULL , NULL , jshep ); + }} +#else + // If this function is used before the 'qthread.task_policy' unit test + // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so. + for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) { + const int num_clone = jshep != main_shep ? 
s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ; + + if ( num_clone ) { + const int ret = qthread_fork_clones_to_local_priority + ( driver_exec_all /* function */ + , NULL /* function data block */ + , NULL /* pointer to return value feb */ + , jshep /* shepherd number */ + , num_clone - 1 /* number of instances - 1 */ + ); + + assert(ret == QTHREAD_SUCCESS); + } + } +#endif + + driver_exec_all( NULL ); + + s_active_function = 0 ; + s_active_function_arg = 0 ; +} + +void * QthreadExec::exec_all_reduce_result() +{ + return s_exec[0]->m_scratch_alloc ; +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +namespace Kokkos { +namespace Impl { + +QthreadTeamPolicyMember::QthreadTeamPolicyMember() + : m_exec( **worker_exec() ) + , m_team_shared(0,0) + , m_team_size( 1 ) // s_number_workers_per_shepherd ) + , m_team_rank( 0 ) // m_exec.shepherd_worker_rank() ) + , m_league_size(1) + , m_league_end(1) + , m_league_rank(0) +{ + m_exec.shared_reset( m_team_shared ); +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */ + diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp new file mode 100755 index 0000000000000000000000000000000000000000..365883685772e89d8d32f9dfbfe79d34c746a9aa --- /dev/null +++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp @@ -0,0 +1,614 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_QTHREADEXEC_HPP +#define KOKKOS_QTHREADEXEC_HPP + +#include <impl/Kokkos_spinwait.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +class QthreadExec ; + +typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * ); + +class QthreadExec { +private: + + enum { Inactive = 0 , Active = 1 }; + + const QthreadExec * const * m_worker_base ; + const QthreadExec * const * m_shepherd_base ; + + void * m_scratch_alloc ; ///< Scratch memory [ reduce , team , shared ] + int m_reduce_end ; ///< End of scratch reduction memory + + int m_shepherd_rank ; + int m_shepherd_size ; + + int m_shepherd_worker_rank ; + int m_shepherd_worker_size ; + + /* + * m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank + * m_worker_size = m_shepherd_size * m_shepherd_worker_size + */ + int m_worker_rank ; + int m_worker_size ; + + int mutable volatile m_worker_state ; + + + friend class Kokkos::Qthread ; + + ~QthreadExec(); + QthreadExec( const QthreadExec & ); + QthreadExec & operator = ( const QthreadExec & ); + +public: + + QthreadExec(); + + /** Execute the input function on all available Qthread workers */ + static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * ); + + //---------------------------------------- + /** Barrier across all workers participating in the 'exec_all' */ + void exec_all_barrier() const + { + const int rev_rank = m_worker_size - ( m_worker_rank + 1 ); + + int n , j ; + + for ( n = 1 ; ( ! 
( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) { + Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active ); + } + + if ( rev_rank ) { + m_worker_state = QthreadExec::Inactive ; + Impl::spinwait( m_worker_state , QthreadExec::Inactive ); + } + + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) { + m_worker_base[j]->m_worker_state = QthreadExec::Active ; + } + } + + /** Barrier across workers within the shepherd with rank < team_rank */ + void shepherd_barrier( const int team_size ) const + { + if ( m_shepherd_worker_rank < team_size ) { + + const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 ); + + int n , j ; + + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) { + Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active ); + } + + if ( rev_rank ) { + m_worker_state = QthreadExec::Inactive ; + Impl::spinwait( m_worker_state , QthreadExec::Inactive ); + } + + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) { + m_shepherd_base[j]->m_worker_state = QthreadExec::Active ; + } + } + } + + //---------------------------------------- + /** Reduce across all workers participating in the 'exec_all' */ + template< class FunctorType , class ArgTag > + inline + void exec_all_reduce( const FunctorType & func ) const + { + typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ; + + const int rev_rank = m_worker_size - ( m_worker_rank + 1 ); + + int n , j ; + + for ( n = 1 ; ( ! 
( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) { + const QthreadExec & fan = *m_worker_base[j]; + + Impl::spinwait( fan.m_worker_state , QthreadExec::Active ); + + ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc ); + } + + if ( rev_rank ) { + m_worker_state = QthreadExec::Inactive ; + Impl::spinwait( m_worker_state , QthreadExec::Inactive ); + } + + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) { + m_worker_base[j]->m_worker_state = QthreadExec::Active ; + } + } + + //---------------------------------------- + /** Scall across all workers participating in the 'exec_all' */ + template< class FunctorType , class ArgTag > + inline + void exec_all_scan( const FunctorType & func ) const + { + typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ; + typedef Kokkos::Impl::FunctorValueOps< FunctorType , ArgTag > ValueOps ; + + const int rev_rank = m_worker_size - ( m_worker_rank + 1 ); + + int n , j ; + + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) { + Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active ); + } + + if ( rev_rank ) { + m_worker_state = QthreadExec::Inactive ; + Impl::spinwait( m_worker_state , QthreadExec::Inactive ); + } + else { + // Root thread scans across values before releasing threads + // Worker data is in reverse order, so m_worker_base[0] is the + // highest ranking thread. + + // Copy from lower ranking to higher ranking worker. + for ( int i = 1 ; i < m_worker_size ; ++i ) { + ValueOps::copy( func + , m_worker_base[i-1]->m_scratch_alloc + , m_worker_base[i]->m_scratch_alloc + ); + } + + ValueInit::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc ); + + // Join from lower ranking to higher ranking worker. + // Value at m_worker_base[n-1] is zero so skip adding it to m_worker_base[n-2]. 
+ for ( int i = m_worker_size - 1 ; --i ; ) { + ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc ); + } + } + + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) { + m_worker_base[j]->m_worker_state = QthreadExec::Active ; + } + } + + //---------------------------------------- + + template< class Type> + inline + volatile Type * shepherd_team_scratch_value() const + { return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); } + + template< class Type > + inline + void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const + { + if ( m_shepherd_base ) { + Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>(); + if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; } + memory_fence(); + shepherd_barrier( team_size ); + value = *shared_value ; + } + } + + template< class Type > + inline + Type shepherd_reduce( const int team_size , const Type & value ) const + { + *shepherd_team_scratch_value<Type>() = value ; + + memory_fence(); + + const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 ); + + int n , j ; + + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) { + Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active ); + } + + if ( rev_rank ) { + m_worker_state = QthreadExec::Inactive ; + Impl::spinwait( m_worker_state , QthreadExec::Inactive ); + } + else { + Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>(); + for ( int i = 1 ; i < n ; ++i ) { + accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>(); + } + for ( int i = 1 ; i < n ; ++i ) { + * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ; + } + + memory_fence(); + } + + for ( n = 1 ; ( ! 
( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) { + m_shepherd_base[j]->m_worker_state = QthreadExec::Active ; + } + + return *shepherd_team_scratch_value<Type>(); + } + + template< class JoinOp > + inline + typename JoinOp::value_type + shepherd_reduce( const int team_size + , const typename JoinOp::value_type & value + , const JoinOp & op ) const + { + typedef typename JoinOp::value_type Type ; + + *shepherd_team_scratch_value<Type>() = value ; + + memory_fence(); + + const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 ); + + int n , j ; + + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) { + Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active ); + } + + if ( rev_rank ) { + m_worker_state = QthreadExec::Inactive ; + Impl::spinwait( m_worker_state , QthreadExec::Inactive ); + } + else { + volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>(); + for ( int i = 1 ; i < team_size ; ++i ) { + op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() ); + } + for ( int i = 1 ; i < team_size ; ++i ) { + * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ; + } + + memory_fence(); + } + + for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) { + m_shepherd_base[j]->m_worker_state = QthreadExec::Active ; + } + + return *shepherd_team_scratch_value<Type>(); + } + + template< class Type > + inline + Type shepherd_scan( const int team_size + , const Type & value + , Type * const global_value = 0 ) const + { + *shepherd_team_scratch_value<Type>() = value ; + + memory_fence(); + + const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 ); + + int n , j ; + + for ( n = 1 ; ( ! 
( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) { + Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active ); + } + + if ( rev_rank ) { + m_worker_state = QthreadExec::Inactive ; + Impl::spinwait( m_worker_state , QthreadExec::Inactive ); + } + else { + // Root thread scans across values before releasing threads + // Worker data is in reverse order, so m_shepherd_base[0] is the + // highest ranking thread. + + // Copy from lower ranking to higher ranking worker. + + Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>(); + for ( int i = 1 ; i < team_size ; ++i ) { + const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>(); + accum += tmp ; + * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ; + } + + * m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() = + global_value ? atomic_fetch_add( global_value , accum ) : 0 ; + + // Join from lower ranking to higher ranking worker. + for ( int i = team_size ; --i ; ) { + * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>(); + } + + memory_fence(); + } + + for ( n = 1 ; ( ! 
( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) { + m_shepherd_base[j]->m_worker_state = QthreadExec::Active ; + } + + return *shepherd_team_scratch_value<Type>(); + } + + //---------------------------------------- + + static inline + int align_alloc( int size ) + { + enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64bytes */}; + enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 }; + return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK ; + } + + void shared_reset( Qthread::scratch_memory_space & ); + + void * exec_all_reduce_value() const { return m_scratch_alloc ; } + + static void * exec_all_reduce_result(); + + static void resize_worker_scratch( const int reduce_size , const int shared_size ); + static void clear_workers(); + + //---------------------------------------- + + inline int worker_rank() const { return m_worker_rank ; } + inline int worker_size() const { return m_worker_size ; } + inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; } + inline int shepherd_worker_size() const { return m_shepherd_worker_size ; } + inline int shepherd_rank() const { return m_shepherd_rank ; } + inline int shepherd_size() const { return m_shepherd_size ; } + + static int worker_per_shepherd(); +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +class QthreadTeamPolicyMember { +private: + + typedef Kokkos::Qthread execution_space ; + typedef execution_space::scratch_memory_space scratch_memory_space ; + + + Impl::QthreadExec & m_exec ; + scratch_memory_space m_team_shared ; + const int m_team_size ; + const int m_team_rank ; + const int m_league_size ; + const int m_league_end ; + int m_league_rank ; + +public: + + KOKKOS_INLINE_FUNCTION + const scratch_memory_space & team_shmem() const { return m_team_shared ; } + + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; } + 
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; } + KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; } + KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; } + + KOKKOS_INLINE_FUNCTION void team_barrier() const +#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + {} +#else + { m_exec.shepherd_barrier( m_team_size ); } +#endif + + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value , int rank ) const +#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return Type(); } +#else + { return m_exec.template shepherd_broadcast<Type>( value , m_team_size , rank ); } +#endif + + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const +#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return Type(); } +#else + { return m_exec.template shepherd_reduce<Type>( m_team_size , value ); } +#endif + + template< typename JoinOp > + KOKKOS_INLINE_FUNCTION typename JoinOp::value_type + team_reduce( const typename JoinOp::value_type & value + , const JoinOp & op ) const +#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return typename JoinOp::value_type(); } +#else + { return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); } +#endif + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. + * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const +#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return Type(); } +#else + { return m_exec.template shepherd_scan<Type>( m_team_size , value ); } +#endif + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. 
+ * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const +#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return Type(); } +#else + { return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); } +#endif + + //---------------------------------------- + // Private driver for task-team parallel + + QthreadTeamPolicyMember(); + + //---------------------------------------- + // Private for the driver ( for ( member_type i(exec,team); i ; i.next_team() ) { ... } + + // Initialize + template< class Arg0 , class Arg1 > + QthreadTeamPolicyMember( Impl::QthreadExec & exec , const TeamPolicy<Arg0,Arg1,Qthread> & team ) + : m_exec( exec ) + , m_team_shared(0,0) + , m_team_size( team.m_team_size ) + , m_team_rank( exec.shepherd_worker_rank() ) + , m_league_size( team.m_league_size ) + , m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) ) + , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 ) + { + m_exec.shared_reset( m_team_shared ); + } + + // Continue + operator bool () const { return m_league_rank < m_league_end ; } + + // iterate + void next_team() { ++m_league_rank ; m_exec.shared_reset( m_team_shared ); } +}; + +} // namespace Impl + +template< class Arg0 , class Arg1 > +class TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > +{ +private: + + const int m_league_size ; + const int m_team_size ; + const int m_shepherd_iter ; + +public: + + //! 
Tag this class as a kokkos execution policy + typedef TeamPolicy execution_policy ; + typedef Qthread execution_space ; + + typedef typename + Impl::if_c< ! Impl::is_same< Kokkos::Qthread , Arg0 >::value , Arg0 , Arg1 >::type + work_tag ; + + //---------------------------------------- + + template< class FunctorType > + inline static + int team_size_max( const FunctorType & ) + { return Qthread::instance().shepherd_worker_size(); } + + template< class FunctorType > + static int team_size_recommended( const FunctorType & f ) + { return team_size_max( f ); } + + template< class FunctorType > + inline static + int team_size_recommended( const FunctorType & f , const int& ) + { return team_size_max( f ); } + + //---------------------------------------- + + inline int team_size() const { return m_team_size ; } + inline int league_size() const { return m_league_size ; } + + // One active team per shepherd + TeamPolicy( Kokkos::Qthread & q + , const int league_size + , const int team_size + ) + : m_league_size( league_size ) + , m_team_size( team_size < q.shepherd_worker_size() + ? team_size : q.shepherd_worker_size() ) + , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() ) + { + } + + // One active team per shepherd + TeamPolicy( const int league_size + , const int team_size + ) + : m_league_size( league_size ) + , m_team_size( team_size < Qthread::instance().shepherd_worker_size() + ? 
team_size : Qthread::instance().shepherd_worker_size() ) + , m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() ) + { + } + + typedef Impl::QthreadTeamPolicyMember member_type ; + + friend class Impl::QthreadTeamPolicyMember ; +}; + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_QTHREADEXEC_HPP */ + diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp new file mode 100755 index 0000000000000000000000000000000000000000..dc76a0c42633ad576997f0747b2b934d408d3b70 --- /dev/null +++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp @@ -0,0 +1,643 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_QTHREAD_PARALLEL_HPP +#define KOKKOS_QTHREAD_PARALLEL_HPP + +#include <vector> + +#include <Kokkos_Parallel.hpp> + +#include <impl/Kokkos_StaticAssert.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> + +#include <Qthread/Kokkos_QthreadExec.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ; + + const FunctorType m_func ; + const Policy m_policy ; + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , const PType & range ) + { + const typename 
PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( i ); + } + } + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( ! Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , const PType & range ) + { + const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( typename PType::work_tag() , i ); + } + } + + // Function is called once by every concurrent thread. + static void execute( QthreadExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() ) ); + + // All threads wait for completion. + exec.exec_all_barrier(); + } + +public: + + ParallelFor( const FunctorType & functor + , const Policy & policy + ) + : m_func( functor ) + , m_policy( policy ) + { + Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this ); + } +}; + +//---------------------------------------------------------------------------- + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + const FunctorType m_func ; + const Policy m_policy ; + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( 
Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , reference_type update + , const PType & range ) + { + const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( i , update ); + } + } + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( ! Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , reference_type update + , const PType & range ) + { + const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( typename PType::work_tag() , i , update ); + } + } + + static void execute( QthreadExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + + driver( self.m_func + , ValueInit::init( self.m_func , exec.exec_all_reduce_value() ) + , typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() ) + ); + + exec.template exec_all_reduce<FunctorType, typename Policy::work_tag >( self.m_func ); + } + +public: + + template< class HostViewType > + ParallelReduce( const FunctorType & functor + , const Policy & policy + , const HostViewType & result_view ) + : m_func( functor ) + , m_policy( policy ) + { + QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 ); + + Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this ); + + const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result(); + + Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data ); + + if ( result_view.ptr_on_device() ) { + const unsigned n = ValueTraits::value_count( m_func ); + for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; } + } + } +}; + 
+//---------------------------------------------------------------------------- + +template< class FunctorType , class Arg0 , class Arg1 > +class ParallelFor< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > > +{ +private: + + typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > Policy ; + + const FunctorType m_func ; + const Policy m_team ; + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION + void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member ) const + { m_func( member ); } + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION + void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member ) const + { m_func( TagType() , member ); } + + static void execute( QthreadExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + typename Policy::member_type member( exec , self.m_team ); + + while ( member ) { + self.ParallelFor::template driver< typename Policy::work_tag >( member ); + member.team_barrier(); + member.next_team(); + } + + exec.exec_all_barrier(); + } + +public: + + ParallelFor( const FunctorType & functor , + const Policy & policy ) + : m_func( functor ) + , m_team( policy ) + { + QthreadExec::resize_worker_scratch + ( /* reduction memory */ 0 + , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) ); + + Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this ); + } +}; + +//---------------------------------------------------------------------------- + +template< class FunctorType , class Arg0 , class Arg1 > +class ParallelReduce< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > > +{ +private: + + typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > Policy ; + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits 
; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + const FunctorType m_func ; + const Policy m_team ; + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION + void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member + , reference_type update ) const + { m_func( member , update ); } + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION + void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member + , reference_type update ) const + { m_func( TagType() , member , update ); } + + static void execute( QthreadExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + + // Initialize thread-local value + reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() ); + + typename Policy::member_type member( exec , self.m_team ); + + while ( member ) { + self.ParallelReduce::template driver< typename Policy::work_tag >( member , update ); + member.team_barrier(); + member.next_team(); + } + + exec.template exec_all_reduce< FunctorType , typename Policy::work_tag >( self.m_func ); + } + +public: + + template< class ViewType > + ParallelReduce( const FunctorType & functor , + const Policy & policy , + const ViewType & result ) + : m_func( functor ) + , m_team( policy ) + { + QthreadExec::resize_worker_scratch + ( /* reduction memory */ ValueTraits::value_size( functor ) + , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) ); + + Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this ); + + const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result(); + + 
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data ); + + const unsigned n = ValueTraits::value_count( m_func ); + for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; } + } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + const FunctorType m_func ; + const Policy m_policy ; + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , reference_type update + , const bool final + , const PType & range ) + { + const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( i , update , final ); + } + } + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( ! 
Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , reference_type update + , const bool final + , const PType & range ) + { + const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( typename PType::work_tag() , i , update , final ); + } + } + + static void execute( QthreadExec & exec , const void * arg ) + { + const ParallelScan & self = * ((const ParallelScan *) arg ); + + const typename Policy::WorkRange range( self.m_policy , exec.worker_rank() , exec.worker_size() ); + + // Initialize thread-local value + reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() ); + + driver( self.m_func , update , false , range ); + + exec.template exec_all_scan< FunctorType , typename Policy::work_tag >( self.m_func ); + + driver( self.m_func , update , true , range ); + + exec.exec_all_barrier(); + } + +public: + + ParallelScan( const FunctorType & functor + , const Policy & policy + ) + : m_func( functor ) + , m_policy( policy ) + { + QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 ); + + Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::execute , this ); + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember> +TeamThreadRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>(thread,count); +} + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember> +TeamThreadRange( const Impl::QthreadTeamPolicyMember& 
thread + , const iType & begin + , const iType & end + ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>(thread,begin,end); +} + + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember > + ThreadVectorRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count) { + return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >(thread,count); +} + + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> PerTeam(const Impl::QthreadTeamPolicyMember& thread) { + return Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::QthreadTeamPolicyMember& thread) { + return Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>(thread); +} + +} // namespace Kokkos + +namespace Kokkos { + + /** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the calling thread team. + * This functionality requires C++11 support.*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, const Lambda& lambda) { + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i); +} + +/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of + * val is performed and put into result.
This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, + const Lambda & lambda, ValueType& result) { + + result = ValueType(); + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + result+=tmp; + } + + result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>()); +} + +#if defined( KOKKOS_HAVE_CXX11 ) + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of + * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result. + * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore + * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or + * '1 for *'). This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, + const Lambda & lambda, const JoinType& join, ValueType& init_result) { + + ValueType result = init_result; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + join(result,tmp); + } + + init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join)); +} + +#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */ + +} // namespace Kokkos + +namespace Kokkos { +/** \brief Intra-thread vector parallel_for.
Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the calling thread. + * This functionality requires C++11 support.*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >& + loop_boundaries, const Lambda& lambda) { + #ifdef KOKKOS_HAVE_PRAGMA_IVDEP + #pragma ivdep + #endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i); +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of + * val is performed and put into result. This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >& + loop_boundaries, const Lambda & lambda, ValueType& result) { + result = ValueType(); +#ifdef KOKKOS_HAVE_PRAGMA_IVDEP +#pragma ivdep +#endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + result+=tmp; + } +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of + * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result. + * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore + * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or + * '1 for *').
This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >& + loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) { + + ValueType result = init_result; +#ifdef KOKKOS_HAVE_PRAGMA_IVDEP +#pragma ivdep +#endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + join(result,tmp); + } + init_result = result; +} + +/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final) + * for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed. + * Depending on the target execution space the operator might be called twice: once with final=false + * and once with final=true. When final==true val contains the prefix sum value. The contribution of this + * "i" needs to be added to val no matter whether final==true or not. In a serial execution + * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set + * to the final sum value over all vector lanes. 
+ * This functionality requires C++11 support.*/ +template< typename iType, class FunctorType > +KOKKOS_INLINE_FUNCTION +void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >& + loop_boundaries, const FunctorType & lambda) { + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef typename ValueTraits::value_type value_type ; + + value_type scan_val = value_type(); + +#ifdef KOKKOS_HAVE_PRAGMA_IVDEP +#pragma ivdep +#endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,scan_val,true); + } +} + +} // namespace Kokkos + +namespace Kokkos { + +template<class FunctorType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) { + lambda(); +} + +template<class FunctorType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) { + if(single_struct.team_member.team_rank()==0) lambda(); +} + +template<class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) { + lambda(val); +} + +template<class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) { + if(single_struct.team_member.team_rank()==0) { + lambda(val); + } + single_struct.team_member.team_broadcast(val,0); +} + +} // namespace Kokkos + + +#endif /* #define KOKKOS_QTHREAD_PARALLEL_HPP */ + diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp new file mode 100755 index 0000000000000000000000000000000000000000..9787d2646296568caca3dccef39d06ee1bbaef55 --- 
/dev/null +++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp @@ -0,0 +1,451 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#include <Kokkos_Core_fwd.hpp> + +#if defined( KOKKOS_HAVE_QTHREAD ) + +#include <stdio.h> + +#include <stdlib.h> +#include <stdexcept> +#include <iostream> +#include <sstream> +#include <string> + +#include <Kokkos_Atomic.hpp> +#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +typedef TaskMember< Kokkos::Qthread , void , void > Task ; + +namespace { + +inline +unsigned padded_sizeof_derived( unsigned sizeof_derived ) +{ + return sizeof_derived + + ( sizeof_derived % sizeof(Task*) ? sizeof(Task*) - sizeof_derived % sizeof(Task*) : 0 ); +} + +// int lock_alloc_dealloc = 0 ; + +} // namespace + +void Task::deallocate( void * ptr ) +{ + // Counting on 'free' thread safety so lock/unlock not required. + // However, isolate calls here to mitigate future need to introduce lock/unlock. + + // lock + + // while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) ); + + free( ptr ); + + // unlock + + // Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 ); +} + +void * Task::allocate( const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity ) +{ + // Counting on 'malloc' thread safety so lock/unlock not required. + // However, isolate calls here to mitigate future need to introduce lock/unlock. + + // lock + + // while ( ! 
Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) ); + + void * const ptr = malloc( padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) ); + + // unlock + + // Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 ); + + return ptr ; +} + +Task::~TaskMember() +{ + +} + + +Task::TaskMember( const function_verify_type arg_verify + , const function_dealloc_type arg_dealloc + , const function_apply_single_type arg_apply_single + , const function_apply_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + ) + : m_dealloc( arg_dealloc ) + , m_verify( arg_verify ) + , m_apply_single( arg_apply_single ) + , m_apply_team( arg_apply_team ) + , m_active_count( & arg_active_count ) + , m_qfeb(0) + , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) ) + , m_dep_capacity( arg_dependence_capacity ) + , m_dep_size( 0 ) + , m_ref_count( 0 ) + , m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING ) +{ + qthread_empty( & m_qfeb ); // Set to full when complete + for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ; +} + +Task::TaskMember( const function_dealloc_type arg_dealloc + , const function_apply_single_type arg_apply_single + , const function_apply_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + ) + : m_dealloc( arg_dealloc ) + , m_verify( & Task::verify_type<void> ) + , m_apply_single( arg_apply_single ) + , m_apply_team( arg_apply_team ) + , m_active_count( & arg_active_count ) + , m_qfeb(0) + , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) ) + , m_dep_capacity( arg_dependence_capacity ) + , m_dep_size( 0 ) + , m_ref_count( 0 ) + , m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING ) +{ + qthread_empty( & m_qfeb 
); // Set to full when complete + for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ; +} + +//---------------------------------------------------------------------------- + +void Task::throw_error_add_dependence() const +{ + std::cerr << "TaskMember< Qthread >::add_dependence ERROR" + << " state(" << m_state << ")" + << " dep_size(" << m_dep_size << ")" + << std::endl ; + throw std::runtime_error("TaskMember< Qthread >::add_dependence ERROR"); +} + +void Task::throw_error_verify_type() +{ + throw std::runtime_error("TaskMember< Qthread >::verify_type ERROR"); +} + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) +void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw ) +{ + static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ; + static const char msg_error_count[] = ": negative reference count" ; + static const char msg_error_complete[] = ": destroy task that is not complete" ; + static const char msg_error_dependences[] = ": destroy task that has dependences" ; + static const char msg_error_exception[] = ": caught internal exception" ; + + if ( rhs ) { Kokkos::atomic_fetch_add( & (*rhs).m_ref_count , 1 ); } + + Task * const lhs_val = Kokkos::atomic_exchange( lhs , rhs ); + + if ( lhs_val ) { + + const int count = Kokkos::atomic_fetch_add( & (*lhs_val).m_ref_count , -1 ); + + const char * msg_error = 0 ; + + try { + + if ( 1 == count ) { + + // Reference count at zero, delete it + + // Should only be deallocating a completed task + if ( (*lhs_val).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) { + + // A completed task should not have dependences... 
+ for ( int i = 0 ; i < (*lhs_val).m_dep_size && 0 == msg_error ; ++i ) { + if ( (*lhs_val).m_dep[i] ) msg_error = msg_error_dependences ; + } + } + else { + msg_error = msg_error_complete ; + } + + if ( 0 == msg_error ) { + // Get deletion function and apply it + const Task::function_dealloc_type d = (*lhs_val).m_dealloc ; + + (*d)( lhs_val ); + } + } + else if ( count <= 0 ) { + msg_error = msg_error_count ; + } + } + catch( ... ) { + if ( 0 == msg_error ) msg_error = msg_error_exception ; + } + + if ( 0 != msg_error ) { + if ( no_throw ) { + std::cerr << msg_error_header << msg_error << std::endl ; + std::cerr.flush(); + } + else { + std::string msg(msg_error_header); + msg.append(msg_error); + throw std::runtime_error( msg ); + } + } + } +} +#endif + + +//---------------------------------------------------------------------------- + +aligned_t Task::qthread_func( void * arg ) +{ + Task * const task = reinterpret_cast< Task * >(arg); + + // First member of the team change state to executing. + // Use compare-exchange to avoid race condition with a respawn. + Kokkos::atomic_compare_exchange_strong( & task->m_state + , int(Kokkos::Experimental::TASK_STATE_WAITING) + , int(Kokkos::Experimental::TASK_STATE_EXECUTING) + ); + + // It is a single thread's responsibility to close out + // this task's execution. 
+ bool close_out = false ; + + if ( task->m_apply_team ) { + + Kokkos::Impl::QthreadTeamPolicyMember member ; + + (*task->m_apply_team)( task , member ); + +fprintf( stdout + , "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n" + , qthread_shep() + , qthread_worker_local(NULL) + , reinterpret_cast<unsigned long>(task) + , member.team_rank() + , member.team_size() + ); +fflush(stdout); + + member.team_barrier(); + + close_out = member.team_rank() == 0 ; + } + else { + (*task->m_apply_single)( task ); + + close_out = true ; + } + + if ( close_out ) { + + // When dependent tasks run there would be a race + // condition between destroying this task and + // querying the active count pointer from this task. + int volatile * active_count = task->m_active_count ; + + if ( task->m_state == ( Kokkos::Experimental::TASK_STATE_WAITING | Kokkos::Experimental::TASK_STATE_EXECUTING ) ) { + +#if 0 +fprintf( stdout + , "worker(%d.%d) task 0x%.12lx respawn\n" + , qthread_shep() + , qthread_worker_local(NULL) + , reinterpret_cast<unsigned long>(task) + ); +fflush(stdout); +#endif + + // Task respawned, set state to waiting and reschedule the task + task->m_state = Kokkos::Experimental::TASK_STATE_WAITING ; + task->schedule(); + } + else { + + // Task did not respawn, is complete + task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ; + + // Release dependences before allowing dependent tasks to run. + // Otherwise there is a thread race condition for removing dependences. + for ( int i = 0 ; i < task->m_dep_size ; ++i ) { + assign( & task->m_dep[i] , 0 ); + } + + // Set qthread FEB to full so that dependent tasks are allowed to execute. + // This 'task' may be deleted immediately following this function call. + qthread_fill( & task->m_qfeb ); + } + + // Decrement active task count before returning. 
+ Kokkos::atomic_decrement( active_count ); + } + +#if 0 +fprintf( stdout + , "worker(%d.%d) task 0x%.12lx return\n" + , qthread_shep() + , qthread_worker_local(NULL) + , reinterpret_cast<unsigned long>(task) + ); +fflush(stdout); +#endif + + return 0 ; +} + +void Task::respawn() +{ + // Change state from pure executing to ( waiting | executing ) + // to avoid confusion with simply waiting. + Kokkos::atomic_compare_exchange_strong( & m_state + , int(Kokkos::Experimental::TASK_STATE_EXECUTING) + , int(Kokkos::Experimental::TASK_STATE_WAITING | + Kokkos::Experimental::TASK_STATE_EXECUTING) + ); +} + +void Task::schedule() +{ + // Is waiting for execution + + // Increment active task count before spawning. + Kokkos::atomic_increment( m_active_count ); + + // spawn in qthread. must malloc the precondition array and give to qthread. + // qthread will eventually free this allocation so memory will not be leaked. + + // concern with thread safety of malloc, does this need to be guarded? + aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) ); + + qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) ); + + for ( int i = 0 ; i < m_dep_size ; ++i ) { + qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag + } + + if ( m_apply_single ) { + qthread_spawn( & Task::qthread_func /* function */ + , this /* function argument */ + , 0 + , NULL + , m_dep_size , qprecon /* dependences */ + , NO_SHEPHERD + , QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */ + ); + } + else { + // If more than one shepherd spawn on a shepherd other than this shepherd + const int num_shepherd = qthread_num_shepherds(); + const int num_worker_per_shepherd = qthread_num_workers_local(NO_SHEPHERD); + const int this_shepherd = qthread_shep(); + + int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ; + +fprintf( stdout + , "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n" + , qthread_shep() + , 
qthread_worker_local(NULL) + , reinterpret_cast<unsigned long>(this) + , spawn_shepherd + , num_worker_per_shepherd - 1 + ); +fflush(stdout); + + qthread_spawn_cloneable + ( & Task::qthread_func + , this + , 0 + , NULL + , m_dep_size , qprecon /* dependences */ + , spawn_shepherd + // , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ) + , unsigned( QTHREAD_SPAWN_LOCAL_PRIORITY ) + , num_worker_per_shepherd - 1 + ); + } +} + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +namespace Kokkos { +namespace Experimental { + +TaskPolicy< Kokkos::Qthread >::member_type & +TaskPolicy< Kokkos::Qthread >::member_single() +{ + static member_type s ; + return s ; +} + +void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy ) +{ + volatile int * const active_task_count = & policy.m_active_count ; + while ( *active_task_count ) qthread_yield(); +} + +} // namespace Experimental +} // namespace Kokkos + +#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */ + diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp new file mode 100755 index 0000000000000000000000000000000000000000..af44b62a1977d59ca20b01ad6d819b654219e688 --- /dev/null +++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp @@ -0,0 +1,646 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_QTHREAD_TASKPOLICY_HPP +#define KOKKOS_QTHREAD_TASKPOLICY_HPP + +#include <string> +#include <typeinfo> +#include <stdexcept> + +//---------------------------------------------------------------------------- +// Defines to enable experimental Qthread functionality + +#define QTHREAD_LOCAL_PRIORITY +#define CLONED_TASKS + +#include <qthread.h> + +#undef QTHREAD_LOCAL_PRIORITY +#undef CLONED_TASKS + +//---------------------------------------------------------------------------- + +#include <Kokkos_Qthread.hpp> +#include <Kokkos_TaskPolicy.hpp> +#include <Kokkos_View.hpp> + +#include <impl/Kokkos_FunctorAdapter.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template<> +class TaskMember< Kokkos::Qthread , void , void > +{ +public: + + typedef void (* function_apply_single_type) ( TaskMember * ); + typedef void (* function_apply_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & ); + typedef void (* function_dealloc_type)( TaskMember * ); + typedef TaskMember * (* function_verify_type) ( TaskMember * ); + +private: + + const function_dealloc_type m_dealloc ; ///< Deallocation + const function_verify_type m_verify ; ///< Result type verification + const function_apply_single_type m_apply_single ; ///< Apply function + const function_apply_team_type m_apply_team ; ///< Apply function + int volatile * const m_active_count ; ///< Count of active tasks on this policy + aligned_t m_qfeb ; ///< Qthread full/empty bit + TaskMember ** const m_dep ; ///< Dependences + const int m_dep_capacity ; ///< Capacity of dependences + int m_dep_size ; ///< Actual count of dependences + int m_ref_count ; ///< Reference count + int m_state ; ///< State of the 
task + + TaskMember() /* = delete */ ; + TaskMember( const TaskMember & ) /* = delete */ ; + TaskMember & operator = ( const TaskMember & ) /* = delete */ ; + + static aligned_t qthread_func( void * arg ); + + static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity ); + static void deallocate( void * ); + + void throw_error_add_dependence() const ; + static void throw_error_verify_type(); + + template < class DerivedTaskType > + static + void deallocate( TaskMember * t ) + { + DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t); + ptr->~DerivedTaskType(); + deallocate( (void *) ptr ); + } + + void schedule(); + +protected : + + ~TaskMember(); + + // Used by TaskMember< Qthread , ResultType , void > + TaskMember( const function_verify_type arg_verify + , const function_dealloc_type arg_dealloc + , const function_apply_single_type arg_apply_single + , const function_apply_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + ); + + // Used for TaskMember< Qthread , void , void > + TaskMember( const function_dealloc_type arg_dealloc + , const function_apply_single_type arg_apply_single + , const function_apply_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + ); + +public: + + template< typename ResultType > + KOKKOS_FUNCTION static + TaskMember * verify_type( TaskMember * t ) + { + enum { check_type = ! 
Kokkos::Impl::is_same< ResultType , void >::value }; + + if ( check_type && t != 0 ) { + + // Verify that t->m_verify is this function + const function_verify_type self = & TaskMember::template verify_type< ResultType > ; + + if ( t->m_verify != self ) { + t = 0 ; +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + throw_error_verify_type(); +#endif + } + } + return t ; + } + + //---------------------------------------- + /* Inheritence Requirements on task types: + * typedef FunctorType::value_type value_type ; + * class DerivedTaskType + * : public TaskMember< Qthread , value_type , FunctorType > + * { ... }; + * class TaskMember< Qthread , value_type , FunctorType > + * : public TaskMember< Qthread , value_type , void > + * , public Functor + * { ... }; + * If value_type != void + * class TaskMember< Qthread , value_type , void > + * : public TaskMember< Qthread , void , void > + * + * Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ] + * + */ + + /** \brief Allocate and construct a single-thread task */ + template< class DerivedTaskType > + static + TaskMember * create_single( const typename DerivedTaskType::functor_type & arg_functor + , volatile int & arg_active_count + , const unsigned arg_dependence_capacity ) + { + typedef typename DerivedTaskType::functor_type functor_type ; + typedef typename functor_type::value_type value_type ; + + DerivedTaskType * const task = + new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) ) + DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType > + , & TaskMember::template apply_single< functor_type , value_type > + , 0 + , arg_active_count + , sizeof(DerivedTaskType) + , arg_dependence_capacity + , arg_functor ); + + return static_cast< TaskMember * >( task ); + } + + /** \brief Allocate and construct a team-thread task */ + template< class DerivedTaskType > + static + TaskMember * create_team( const typename DerivedTaskType::functor_type & arg_functor + , 
volatile int & arg_active_count + , const unsigned arg_dependence_capacity ) + { + typedef typename DerivedTaskType::functor_type functor_type ; + typedef typename functor_type::value_type value_type ; + + DerivedTaskType * const task = + new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) ) + DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType > + , 0 + , & TaskMember::template apply_team< functor_type , value_type > + , arg_active_count + , sizeof(DerivedTaskType) + , arg_dependence_capacity + , arg_functor ); + + return static_cast< TaskMember * >( task ); + } + + void respawn(); + void spawn() + { + m_state = Kokkos::Experimental::TASK_STATE_WAITING ; + schedule(); + } + + //---------------------------------------- + + typedef FutureValueTypeIsVoidError get_result_type ; + + KOKKOS_INLINE_FUNCTION + get_result_type get() const { return get_result_type() ; } + + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); } + + //---------------------------------------- + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + static + void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ); +#else + KOKKOS_INLINE_FUNCTION static + void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {} +#endif + + KOKKOS_INLINE_FUNCTION + TaskMember * get_dependence( int i ) const + { return ( Kokkos::Experimental::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? 
m_dep[i] : (TaskMember*) 0 ; } + + KOKKOS_INLINE_FUNCTION + int get_dependence() const + { return m_dep_size ; } + + KOKKOS_INLINE_FUNCTION + void clear_dependence() + { + for ( int i = 0 ; i < m_dep_size ; ++i ) assign( m_dep + i , 0 ); + m_dep_size = 0 ; + } + + KOKKOS_INLINE_FUNCTION + void add_dependence( TaskMember * before ) + { + if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state || + Kokkos::Experimental::TASK_STATE_EXECUTING == m_state ) && + m_dep_size < m_dep_capacity ) { + assign( m_dep + m_dep_size , before ); + ++m_dep_size ; + } + else { + throw_error_add_dependence(); + } + } + + //---------------------------------------- + + template< class FunctorType , class ResultType > + KOKKOS_INLINE_FUNCTION static + void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t ) + { + typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ; + + // TaskMember< Kokkos::Qthread , ResultType , FunctorType > + // : public TaskMember< Kokkos::Qthread , ResultType , void > + // , public FunctorType + // { ... }; + + derived_type & m = * static_cast< derived_type * >( t ); + + Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) m , & m.m_result ); + } + + template< class FunctorType , class ResultType > + KOKKOS_INLINE_FUNCTION static + void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t ) + { + typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ; + + // TaskMember< Kokkos::Qthread , ResultType , FunctorType > + // : public TaskMember< Kokkos::Qthread , ResultType , void > + // , public FunctorType + // { ... 
}; + + derived_type & m = * static_cast< derived_type * >( t ); + + Kokkos::Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) m ); + } + + //---------------------------------------- + + template< class FunctorType , class ResultType > + KOKKOS_INLINE_FUNCTION static + void apply_team( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t + , Kokkos::Impl::QthreadTeamPolicyMember & member ) + { + typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ; + + derived_type & m = * static_cast< derived_type * >( t ); + + m.FunctorType::apply( member , m.m_result ); + } + + template< class FunctorType , class ResultType > + KOKKOS_INLINE_FUNCTION static + void apply_team( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t + , Kokkos::Impl::QthreadTeamPolicyMember & member ) + { + typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ; + + derived_type & m = * static_cast< derived_type * >( t ); + + m.FunctorType::apply( member ); + } +}; + +//---------------------------------------------------------------------------- +/** \brief Base class for tasks with a result value in the Qthread execution space. + * + * The FunctorType must be void because this class is accessed by the + * Future class for the task and result value. + * + * Must be derived from TaskMember<S,void,void> 'root class' so the Future class + * can correctly static_cast from the 'root class' to this class. 
+ */ +template < class ResultType > +class TaskMember< Kokkos::Qthread , ResultType , void > + : public TaskMember< Kokkos::Qthread , void , void > +{ +public: + + ResultType m_result ; + + typedef const ResultType & get_result_type ; + + KOKKOS_INLINE_FUNCTION + get_result_type get() const { return m_result ; } + +protected: + + typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ; + typedef task_root_type::function_dealloc_type function_dealloc_type ; + typedef task_root_type::function_apply_single_type function_apply_single_type ; + typedef task_root_type::function_apply_team_type function_apply_team_type ; + + inline + TaskMember( const function_dealloc_type arg_dealloc + , const function_apply_single_type arg_apply_single + , const function_apply_team_type arg_apply_team + , volatile int & arg_active_count + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + ) + : task_root_type( & task_root_type::template verify_type< ResultType > + , arg_dealloc + , arg_apply_single + , arg_apply_team + , arg_active_count + , arg_sizeof_derived + , arg_dependence_capacity ) + , m_result() + {} +}; + +template< class ResultType , class FunctorType > +class TaskMember< Kokkos::Qthread , ResultType , FunctorType > + : public TaskMember< Kokkos::Qthread , ResultType , void > + , public FunctorType +{ +public: + + typedef FunctorType functor_type ; + + typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ; + typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ; + typedef task_root_type::function_dealloc_type function_dealloc_type ; + typedef task_root_type::function_apply_single_type function_apply_single_type ; + typedef task_root_type::function_apply_team_type function_apply_team_type ; + + inline + TaskMember( const function_dealloc_type arg_dealloc + , const function_apply_single_type arg_apply_single + , const function_apply_team_type arg_apply_team + , volatile int & arg_active_count + , const 
unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + , const functor_type & arg_functor + ) + : task_base_type( arg_dealloc + , arg_apply_single + , arg_apply_team + , arg_active_count + , arg_sizeof_derived + , arg_dependence_capacity ) + , functor_type( arg_functor ) + {} +}; + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +void wait( TaskPolicy< Kokkos::Qthread > & ); + +template<> +class TaskPolicy< Kokkos::Qthread > +{ +public: + + typedef Kokkos::Qthread execution_space ; + typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ; + +private: + + typedef Impl::TaskMember< execution_space , void , void > task_root_type ; + + TaskPolicy & operator = ( const TaskPolicy & ) /* = delete */ ; + + template< class FunctorType > + static inline + const task_root_type * get_task_root( const FunctorType * f ) + { + typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ; + return static_cast< const task_root_type * >( static_cast< const task_type * >(f) ); + } + + template< class FunctorType > + static inline + task_root_type * get_task_root( FunctorType * f ) + { + typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ; + return static_cast< task_root_type * >( static_cast< task_type * >(f) ); + } + + const unsigned m_default_dependence_capacity ; + volatile int m_active_count_root ; + volatile int & m_active_count ; + +public: + + KOKKOS_INLINE_FUNCTION + TaskPolicy() + : m_default_dependence_capacity(4) + , m_active_count_root(0) + , m_active_count( m_active_count_root ) + {} + + KOKKOS_INLINE_FUNCTION + explicit + TaskPolicy( const unsigned arg_default_dependence_capacity ) + : 
m_default_dependence_capacity( arg_default_dependence_capacity ) + , m_active_count_root(0) + , m_active_count( m_active_count_root ) + {} + + KOKKOS_INLINE_FUNCTION + TaskPolicy( const TaskPolicy & rhs ) + : m_default_dependence_capacity( rhs.m_default_dependence_capacity ) + , m_active_count_root(0) + , m_active_count( rhs.m_active_count ) + {} + + KOKKOS_INLINE_FUNCTION + TaskPolicy( const TaskPolicy & rhs + , const unsigned arg_default_dependence_capacity ) + : m_default_dependence_capacity( arg_default_dependence_capacity ) + , m_active_count_root(0) + , m_active_count( rhs.m_active_count ) + {} + + //---------------------------------------- + + template< class ValueType > + const Future< ValueType , execution_space > & + spawn( const Future< ValueType , execution_space > & f ) const + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + f.m_task->spawn(); +#endif + return f ; + } + + // Create single-thread task + + template< class FunctorType > + Future< typename FunctorType::value_type , execution_space > + create( const FunctorType & functor + , const unsigned dependence_capacity = ~0u ) const + { + typedef typename FunctorType::value_type value_type ; + typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ; + return Future< value_type , execution_space >( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + task_root_type::create_single< task_type > + ( functor + , m_active_count + , ( ~0u == dependence_capacity ? 
m_default_dependence_capacity : dependence_capacity ) + ) +#endif + ); + } + + // Create thread-team task + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + Future< typename FunctorType::value_type , execution_space > + create_team( const FunctorType & functor + , const unsigned dependence_capacity = ~0u ) const + { + typedef typename FunctorType::value_type value_type ; + typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ; + + return Future< value_type , execution_space >( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + task_root_type::create_team< task_type > + ( functor + , m_active_count + , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) + ) +#endif + ); + } + + // Add dependence + template< class A1 , class A2 , class A3 , class A4 > + void add_dependence( const Future<A1,A2> & after + , const Future<A3,A4> & before + , typename Kokkos::Impl::enable_if + < Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value + && + Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value + >::type * = 0 + ) + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + after.m_task->add_dependence( before.m_task ); +#endif + } + + //---------------------------------------- + // Functions for an executing task functor to query dependences, + // set new dependences, and respawn itself. 
+ + template< class FunctorType > + Future< void , execution_space > + get_dependence( const FunctorType * task_functor , int i ) const + { + return Future<void,execution_space>( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + get_task_root(task_functor)->get_dependence(i) +#endif + ); + } + + template< class FunctorType > + int get_dependence( const FunctorType * task_functor ) const +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return get_task_root(task_functor)->get_dependence(); } +#else + { return 0 ; } +#endif + + template< class FunctorType > + void clear_dependence( FunctorType * task_functor ) const + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + get_task_root(task_functor)->clear_dependence(); +#endif + } + + template< class FunctorType , class A3 , class A4 > + void add_dependence( FunctorType * task_functor + , const Future<A3,A4> & before + , typename Kokkos::Impl::enable_if + < Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value + >::type * = 0 + ) + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + get_task_root(task_functor)->add_dependence( before.m_task ); +#endif + } + + template< class FunctorType > + void respawn( FunctorType * task_functor ) const +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { get_task_root(task_functor)->respawn(); } +#else + {} +#endif + + static member_type & member_single(); + + friend void wait( TaskPolicy< Kokkos::Qthread > & ); +}; + +} /* namespace Experimental */ +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_QTHREAD_TASK_HPP */ + diff --git a/lib/kokkos/core/src/Qthread/README b/lib/kokkos/core/src/Qthread/README new file mode 100755 index 0000000000000000000000000000000000000000..5d8f29a4ee706d813fe344c35d4ad1c96bfbb024 --- /dev/null +++ 
b/lib/kokkos/core/src/Qthread/README @@ -0,0 +1,28 @@ + +# This Qthreads back-end uses an experimental branch of the Qthreads repository with special #define options. + +# Cloning repository and branch: + +git clone https://github.com/stelleg/qthreads qthreads-with-clone + +cd qthreads-with-clone + +# Added to .git/config +# +# [branch "cloned_tasks"] +# remote = origin +# merge = refs/heads/cloned_tasks +# + +git branch cloned_tasks +git checkout cloned_tasks +git pull + +sh autogen.sh + +# configure with 'hwloc' installation: + +./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR} + + + diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp new file mode 100755 index 0000000000000000000000000000000000000000..99553fccb1fae82678b5b6e938a41f08859b0921 --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp @@ -0,0 +1,758 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core_fwd.hpp> + +#if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD ) + +#include <stdint.h> +#include <limits> +#include <utility> +#include <iostream> +#include <sstream> +#include <Kokkos_Threads.hpp> +#include <Kokkos_hwloc.hpp> +#include <Kokkos_Atomic.hpp> +#include <impl/Kokkos_Error.hpp> + + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +namespace { + +ThreadsExec s_threads_process ; +ThreadsExec * s_threads_exec[ ThreadsExec::MAX_THREAD_COUNT ] = { 0 }; +pthread_t s_threads_pid[ ThreadsExec::MAX_THREAD_COUNT ] = { 0 }; +std::pair<unsigned,unsigned> s_threads_coord[ ThreadsExec::MAX_THREAD_COUNT ]; + +int s_thread_pool_size[3] = { 0 , 0 , 0 }; + +unsigned s_current_reduce_size = 0 ; +unsigned s_current_shared_size = 0 ; + +void (* volatile s_current_function)( ThreadsExec & , const void * ); +const void * volatile s_current_function_arg = 
0 ; + +struct Sentinel { + Sentinel() + { + HostSpace::register_in_parallel( ThreadsExec::in_parallel ); + } + + ~Sentinel() + { + if ( s_thread_pool_size[0] || + s_thread_pool_size[1] || + s_thread_pool_size[2] || + s_current_reduce_size || + s_current_shared_size || + s_current_function || + s_current_function_arg || + s_threads_exec[0] ) { + std::cerr << "ERROR : Process exiting without calling Kokkos::Threads::terminate()" << std::endl ; + } + } +}; + +inline +unsigned fan_size( const unsigned rank , const unsigned size ) +{ + const unsigned rank_rev = size - ( rank + 1 ); + unsigned count = 0 ; + for ( unsigned n = 1 ; ( rank_rev + n < size ) && ! ( rank_rev & n ) ; n <<= 1 ) { ++count ; } + return count ; +} + +} // namespace +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void execute_function_noop( ThreadsExec & , const void * ) {} + +void ThreadsExec::driver(void) +{ + ThreadsExec this_thread ; + + while ( ThreadsExec::Active == this_thread.m_pool_state ) { + + (*s_current_function)( this_thread , s_current_function_arg ); + + // Deactivate thread and wait for reactivation + this_thread.m_pool_state = ThreadsExec::Inactive ; + + wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive ); + } +} + +ThreadsExec::ThreadsExec() + : m_pool_base(0) + , m_scratch() + , m_scratch_reduce_end(0) + , m_scratch_thread_end(0) + , m_numa_rank(0) + , m_numa_core_rank(0) + , m_pool_rank(0) + , m_pool_size(0) + , m_pool_fan_size(0) + , m_pool_state( ThreadsExec::Terminating ) +{ + if ( & s_threads_process != this ) { + + // A spawned thread + + ThreadsExec * const nil = 0 ; + + // Which entry in 's_threads_exec', possibly determined from hwloc binding + const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0]) + ? 
((size_t)s_current_function_arg) + : size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord )); + + // Given a good entry set this thread in the 's_threads_exec' array + if ( entry < s_thread_pool_size[0] && + nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) { + + const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate(); + + m_numa_rank = coord.first ; + m_numa_core_rank = coord.second ; + m_pool_base = s_threads_exec ; + m_pool_rank = s_thread_pool_size[0] - ( entry + 1 ); + m_pool_size = s_thread_pool_size[0] ; + m_pool_fan_size = fan_size( m_pool_rank , m_pool_size ); + m_pool_state = ThreadsExec::Active ; + + s_threads_pid[ m_pool_rank ] = pthread_self(); + + // Inform spawning process that the threads_exec entry has been set. + s_threads_process.m_pool_state = ThreadsExec::Active ; + } + else { + // Inform spawning process that the threads_exec entry could not be set. + s_threads_process.m_pool_state = ThreadsExec::Terminating ; + } + } + else { + // Enables 'parallel_for' to execute on unitialized Threads device + m_pool_rank = 0 ; + m_pool_size = 1 ; + m_pool_state = ThreadsExec::Inactive ; + + s_threads_pid[ m_pool_rank ] = pthread_self(); + } +} + +ThreadsExec::~ThreadsExec() +{ + const unsigned entry = m_pool_size - ( m_pool_rank + 1 ); + + m_pool_base = 0 ; + m_scratch.clear(); + m_scratch_reduce_end = 0 ; + m_scratch_thread_end = 0 ; + m_numa_rank = 0 ; + m_numa_core_rank = 0 ; + m_pool_rank = 0 ; + m_pool_size = 0 ; + m_pool_fan_size = 0 ; + + m_pool_state = ThreadsExec::Terminating ; + + if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) { + ThreadsExec * const nil = 0 ; + + atomic_compare_exchange( s_threads_exec + entry , this , nil ); + + s_threads_process.m_pool_state = ThreadsExec::Terminating ; + } +} + + +int ThreadsExec::get_thread_count() +{ + return s_thread_pool_size[0] ; +} + +ThreadsExec * ThreadsExec::get_thread( const int init_thread_rank ) +{ + 
ThreadsExec * const th = + init_thread_rank < s_thread_pool_size[0] + ? s_threads_exec[ s_thread_pool_size[0] - ( init_thread_rank + 1 ) ] : 0 ; + + if ( 0 == th || th->m_pool_rank != init_thread_rank ) { + std::ostringstream msg ; + msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : " + << "thread " << init_thread_rank << " of " << s_thread_pool_size[0] ; + if ( 0 == th ) { + msg << " does not exist" ; + } + else { + msg << " has wrong thread_rank " << th->m_pool_rank ; + } + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + + return th ; +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * ) +{ + ThreadsExec::global_lock(); + ThreadsExec::global_unlock(); + + const int n = exec.m_pool_fan_size ; + const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 ); + + for ( int i = 0 ; i < n ; ++i ) { + Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active ); + } + + exec.m_pool_state = ThreadsExec::Inactive ; +} + +} +} + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void ThreadsExec::verify_is_process( const std::string & name , const bool initialized ) +{ + if ( ! is_process() ) { + std::string msg( name ); + msg.append( " FAILED : Called by a worker thread, can only be called by the master process." ); + Kokkos::Impl::throw_runtime_exception( msg ); + } + + if ( initialized && 0 == s_thread_pool_size[0] ) { + std::string msg( name ); + msg.append( " FAILED : Threads not initialized." ); + Kokkos::Impl::throw_runtime_exception( msg ); + } +} + +int ThreadsExec::in_parallel() +{ + // A thread function is in execution and + // the function argument is not the special threads process argument and + // the master process is a worker or is not the master process. 
+ return s_current_function && + ( & s_threads_process != s_current_function_arg ) && + ( s_threads_process.m_pool_base || ! is_process() ); +} + +// Wait for root thread to become inactive +void ThreadsExec::fence() +{ + if ( s_thread_pool_size[0] ) { + // Wait for the root thread to complete: + Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active ); + } + + s_current_function = 0 ; + s_current_function_arg = 0 ; +} + +/** \brief Begin execution of the asynchronous functor */ +void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg ) +{ + verify_is_process("ThreadsExec::start" , true ); + + if ( s_current_function || s_current_function_arg ) { + Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) ); + } + + s_current_function = func ; + s_current_function_arg = arg ; + + // Activate threads: + for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) { + s_threads_exec[i]->m_pool_state = ThreadsExec::Active ; + } + + if ( s_threads_process.m_pool_size ) { + // Master process is the root thread, run it: + (*func)( s_threads_process , arg ); + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + } +} + +//---------------------------------------------------------------------------- + +bool ThreadsExec::sleep() +{ + verify_is_process("ThreadsExec::sleep", true ); + + if ( & execute_sleep == s_current_function ) return false ; + + fence(); + + ThreadsExec::global_lock(); + + s_current_function = & execute_sleep ; + + // Activate threads: + for ( unsigned i = s_thread_pool_size[0] ; 0 < i ; ) { + s_threads_exec[--i]->m_pool_state = ThreadsExec::Active ; + } + + return true ; +} + +bool ThreadsExec::wake() +{ + verify_is_process("ThreadsExec::wake", true ); + + if ( & execute_sleep != s_current_function ) return false ; + + ThreadsExec::global_unlock(); + + if ( s_threads_process.m_pool_base ) { + execute_sleep( s_threads_process , 0 ); + s_threads_process.m_pool_state = 
ThreadsExec::Inactive ; + } + + fence(); + + return true ; +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) ) +{ + s_current_function = func ; + s_current_function_arg = & s_threads_process ; + + const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ; + + for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) { + ThreadsExec & th = * s_threads_exec[ --i ]; + + th.m_pool_state = ThreadsExec::Active ; + + wait_yield( th.m_pool_state , ThreadsExec::Active ); + } + + if ( s_threads_process.m_pool_base ) { + s_threads_process.m_pool_state = ThreadsExec::Active ; + (*func)( s_threads_process , 0 ); + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + } + + s_current_function_arg = 0 ; + s_current_function = 0 ; +} + +//---------------------------------------------------------------------------- + +void * ThreadsExec::root_reduce_scratch() +{ + return s_threads_process.reduce_memory(); +} + +void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * ) +{ + exec.m_scratch.clear(); + + exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ; + exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ; + + if ( s_threads_process.m_scratch_thread_end ) { + + exec.m_scratch = + HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end ); + + unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() ); + unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned); + + // touch on this thread + while ( ptr < end ) *ptr++ = 0 ; + } +} + +void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size ) +{ + enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 }; + + fence(); + + const size_t old_reduce_size = s_threads_process.m_scratch_reduce_end ; + const size_t old_thread_size = s_threads_process.m_scratch_thread_end - 
s_threads_process.m_scratch_reduce_end ; + + reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ; + thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ; + + // Increase size or deallocate completely. + + if ( ( old_reduce_size < reduce_size ) || + ( old_thread_size < thread_size ) || + ( ( reduce_size == 0 && thread_size == 0 ) && + ( old_reduce_size != 0 || old_thread_size != 0 ) ) ) { + + verify_is_process( "ThreadsExec::resize_scratch" , true ); + + s_threads_process.m_scratch_reduce_end = reduce_size ; + s_threads_process.m_scratch_thread_end = reduce_size + thread_size ; + + execute_serial( & execute_resize_scratch ); + + s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ; + } + + return s_threads_process.m_scratch.alloc_ptr() ; +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::print_configuration( std::ostream & s , const bool detail ) +{ + verify_is_process("ThreadsExec::print_configuration",false); + + fence(); + + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); + + // Forestall compiler warnings for unused variables. 
+ (void) numa_count; + (void) cores_per_numa; + (void) threads_per_core; + + s << "Kokkos::Threads" ; + +#if defined( KOKKOS_HAVE_PTHREAD ) + s << " KOKKOS_HAVE_PTHREAD" ; +#endif +#if defined( KOKKOS_HAVE_HWLOC ) + s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ; +#endif + + if ( s_thread_pool_size[0] ) { + s << " threads[" << s_thread_pool_size[0] << "]" + << " threads_per_numa[" << s_thread_pool_size[1] << "]" + << " threads_per_core[" << s_thread_pool_size[2] << "]" + ; + if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; } + s << " ReduceScratch[" << s_current_reduce_size << "]" + << " SharedScratch[" << s_current_shared_size << "]" ; + s << std::endl ; + + if ( detail ) { + + for ( int i = 0 ; i < s_thread_pool_size[0] ; ++i ) { + + ThreadsExec * const th = s_threads_exec[i] ; + + if ( th ) { + + const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 ); + + s << " Thread[ " << th->m_pool_rank << " : " + << th->m_numa_rank << "." << th->m_numa_core_rank << " ]" ; + + s << " Fan{" ; + for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) { + ThreadsExec * const thfan = th->m_pool_base[rank_rev+(1<<j)] ; + s << " [ " << thfan->m_pool_rank << " : " + << thfan->m_numa_rank << "." 
<< thfan->m_numa_core_rank << " ]" ; + } + s << " }" ; + + if ( th == & s_threads_process ) { + s << " is_process" ; + } + } + s << std::endl ; + } + } + } + else { + s << " not initialized" << std::endl ; + } +} + +//---------------------------------------------------------------------------- + +int ThreadsExec::is_initialized() +{ return 0 != s_threads_exec[0] ; } + +void ThreadsExec::initialize( unsigned thread_count , + unsigned use_numa_count , + unsigned use_cores_per_numa , + bool allow_asynchronous_threadpool ) +{ + static const Sentinel sentinel ; + + const bool is_initialized = 0 != s_thread_pool_size[0] ; + + unsigned thread_spawn_failed = 0 ; + + for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++) + s_threads_exec[i] = NULL; + + if ( ! is_initialized ) { + + // If thread_count, use_numa_count, or use_cores_per_numa are zero + // then they will be given default values based upon hwloc detection + // and allowed asynchronous execution. + + const bool hwloc_avail = hwloc::available(); + + if ( thread_count == 0 ) { + thread_count = hwloc_avail + ? Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core() + : 1 ; + } + + const unsigned thread_spawn_begin = + hwloc::thread_mapping( "Kokkos::Threads::initialize" , + allow_asynchronous_threadpool , + thread_count , + use_numa_count , + use_cores_per_numa , + s_threads_coord ); + + const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ; + + if ( thread_spawn_begin ) { + // Synchronous with s_threads_coord[0] as the process core + // Claim entry #0 for binding the process core. 
+ s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u); + } + + s_thread_pool_size[0] = thread_count ; + s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ; + s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ; + s_current_function = & execute_function_noop ; // Initialization work function + + for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) { + + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + + // If hwloc available then spawned thread will + // choose its own entry in 's_threads_coord' + // otherwise specify the entry. + s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith ); + + // Spawn thread executing the 'driver()' function. + // Wait until spawned thread has attempted to initialize. + // If spawning and initialization is successfull then + // an entry in 's_threads_exec' will be assigned. + if ( ThreadsExec::spawn() ) { + wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive ); + } + if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ; + } + + // Wait for all spawned threads to deactivate before zeroing the function. + + for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) { + // Try to protect against cache coherency failure by casting to volatile. + ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ; + if ( th ) { + wait_yield( th->m_pool_state , ThreadsExec::Active ); + } + else { + ++thread_spawn_failed ; + } + } + + s_current_function = 0 ; + s_current_function_arg = 0 ; + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + + if ( ! thread_spawn_failed ) { + // Bind process to the core on which it was located before spawning occured + Kokkos::hwloc::bind_this_thread( proc_coord ); + + if ( thread_spawn_begin ) { // Include process in pool. 
+ const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate(); + + s_threads_exec[0] = & s_threads_process ; + s_threads_process.m_numa_rank = coord.first ; + s_threads_process.m_numa_core_rank = coord.second ; + s_threads_process.m_pool_base = s_threads_exec ; + s_threads_process.m_pool_rank = thread_count - 1 ; // Reversed for scan-compatible reductions + s_threads_process.m_pool_size = thread_count ; + s_threads_process.m_pool_fan_size = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size ); + s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self(); + } + else { + s_threads_process.m_pool_base = 0 ; + s_threads_process.m_pool_rank = 0 ; + s_threads_process.m_pool_size = 0 ; + s_threads_process.m_pool_fan_size = 0 ; + } + + // Initial allocations: + ThreadsExec::resize_scratch( 1024 , 1024 ); + } + else { + s_thread_pool_size[0] = 0 ; + s_thread_pool_size[1] = 0 ; + s_thread_pool_size[2] = 0 ; + } + } + + if ( is_initialized || thread_spawn_failed ) { + + std::ostringstream msg ; + + msg << "Kokkos::Threads::initialize ERROR" ; + + if ( is_initialized ) { + msg << " : already initialized" ; + } + if ( thread_spawn_failed ) { + msg << " : failed to spawn " << thread_spawn_failed << " threads" ; + } + + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + + // Init the array for used for arbitrarily sized atomics + Impl::init_lock_array_host_space(); + +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::finalize() +{ + verify_is_process("ThreadsExec::finalize",false); + + fence(); + + resize_scratch(0,0); + + const unsigned begin = s_threads_process.m_pool_base ? 
1 : 0 ; + + for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) { + + if ( s_threads_exec[i] ) { + + s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ; + + wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive ); + + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + } + + s_threads_pid[i] = 0 ; + } + + if ( s_threads_process.m_pool_base ) { + ( & s_threads_process )->~ThreadsExec(); + s_threads_exec[0] = 0 ; + } + + Kokkos::hwloc::unbind_this_thread(); + + s_thread_pool_size[0] = 0 ; + s_thread_pool_size[1] = 0 ; + s_thread_pool_size[2] = 0 ; + + // Reset master thread to run solo. + s_threads_process.m_numa_rank = 0 ; + s_threads_process.m_numa_core_rank = 0 ; + s_threads_process.m_pool_base = 0 ; + s_threads_process.m_pool_rank = 0 ; + s_threads_process.m_pool_size = 1 ; + s_threads_process.m_pool_fan_size = 0 ; + s_threads_process.m_pool_state = ThreadsExec::Inactive ; +} + +//---------------------------------------------------------------------------- + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +Threads & Threads::instance(int) +{ + static Threads t ; + return t ; +} + +int Threads::thread_pool_size( int depth ) +{ + return Impl::s_thread_pool_size[depth]; +} + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) +int Threads::thread_pool_rank() +{ + const pthread_t pid = pthread_self(); + int i = 0; + while ( ( i < Impl::s_thread_pool_size[0] ) && ( pid != Impl::s_threads_pid[i] ) ) { ++i ; } + return i ; +} +#endif + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD ) */ + diff --git 
a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp new file mode 100755 index 0000000000000000000000000000000000000000..38206979770984ce69bdca68d09ccd8a1c0ab3bd --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp @@ -0,0 +1,465 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADSEXEC_HPP +#define KOKKOS_THREADSEXEC_HPP + +#include <stdio.h> + +#include <utility> +#include <impl/Kokkos_spinwait.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> +#include <impl/Kokkos_AllocationTracker.hpp> + +#include <Kokkos_Atomic.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +class ThreadsExec { +public: + + // Fan array has log_2(NT) reduction threads plus 2 scan threads + // Currently limited to 16k threads. + enum { MAX_FAN_COUNT = 16 }; + enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) }; + enum { VECTOR_LENGTH = 8 }; + + /** \brief States of a worker thread */ + enum { Terminating ///< Termination in progress + , Inactive ///< Exists, waiting for work + , Active ///< Exists, performing work + , Rendezvous ///< Exists, waiting in a barrier or reduce + + , ScanCompleted + , ScanAvailable + , ReductionAvailable + }; + +private: + + friend class Kokkos::Threads ; + + // Fan-in operations' root is the highest ranking thread + // to place the 'scan' reduction intermediate values on + // the threads that need them. + // For a simple reduction the thread location is arbitrary. 
+ + ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in + + Impl::AllocationTracker m_scratch ; + int m_scratch_reduce_end ; + int m_scratch_thread_end ; + int m_numa_rank ; + int m_numa_core_rank ; + int m_pool_rank ; + int m_pool_size ; + int m_pool_fan_size ; + int volatile m_pool_state ; ///< State for global synchronizations + + + static void global_lock(); + static void global_unlock(); + static bool spawn(); + + static void execute_resize_scratch( ThreadsExec & , const void * ); + static void execute_sleep( ThreadsExec & , const void * ); + + ThreadsExec( const ThreadsExec & ); + ThreadsExec & operator = ( const ThreadsExec & ); + + static void execute_serial( void (*)( ThreadsExec & , const void * ) ); + +public: + + KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size ; } + KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank ; } + KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank ; } + KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank ; } + + static int get_thread_count(); + static ThreadsExec * get_thread( const int init_thread_rank ); + + inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); } + KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; } + + KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; } + KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; } + + static void driver(void); + + ~ThreadsExec(); + ThreadsExec(); + + static void * resize_scratch( size_t reduce_size , size_t thread_size ); + + static void * root_reduce_scratch(); + + static bool is_process(); + + static void verify_is_process( const std::string & , const bool initialized ); + + static int is_initialized(); + + static void initialize( unsigned thread_count , + unsigned use_numa_count , + unsigned 
use_cores_per_numa , + bool allow_asynchronous_threadpool ); + + static void finalize(); + + /* Given a requested team size, return valid team size */ + static unsigned team_size_valid( unsigned ); + + static void print_configuration( std::ostream & , const bool detail = false ); + + //------------------------------------ + + static void wait_yield( volatile int & , const int ); + + //------------------------------------ + // All-thread functions: + + inline + int all_reduce( const int value ) + { + // Make sure there is enough scratch space: + const int rev_rank = m_pool_size - ( m_pool_rank + 1 ); + + *((volatile int*) reduce_memory()) = value ; + + memory_fence(); + + // Fan-in reduction with highest ranking thread as the root + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + // Wait: Active -> Rendezvous + Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active ); + } + + if ( rev_rank ) { + m_pool_state = ThreadsExec::Rendezvous ; + // Wait: Rendezvous -> Active + Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous ); + } + else { + // Root thread does the reduction and broadcast + + int accum = 0 ; + + for ( int rank = 0 ; rank < m_pool_size ; ++rank ) { + accum += *((volatile int *) get_thread( rank )->reduce_memory()); + } + + for ( int rank = 0 ; rank < m_pool_size ; ++rank ) { + *((volatile int *) get_thread( rank )->reduce_memory()) = accum ; + } + + memory_fence(); + + for ( int rank = 0 ; rank < m_pool_size ; ++rank ) { + get_thread( rank )->m_pool_state = ThreadsExec::Active ; + } + } + + return *((volatile int*) reduce_memory()); + } + + //------------------------------------ + // All-thread functions: + + template< class FunctorType , class ArgTag > + inline + void fan_in_reduce( const FunctorType & f ) const + { + typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ; + typedef Kokkos::Impl::FunctorFinal< FunctorType , ArgTag > Final ; + + const int rev_rank = m_pool_size - ( m_pool_rank + 1 ); + + 
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + + ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ; + + Impl::spinwait( fan.m_pool_state , ThreadsExec::Active ); + + Join::join( f , reduce_memory() , fan.reduce_memory() ); + } + + if ( ! rev_rank ) { + Final::final( f , reduce_memory() ); + } + } + + inline + void fan_in() const + { + const int rev_rank = m_pool_size - ( m_pool_rank + 1 ); + + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active ); + } + } + + template< class FunctorType , class ArgTag > + inline + void scan_large( const FunctorType & f ) + { + // Sequence of states: + // 0) Active : entry and exit state + // 1) ReductionAvailable : reduction value available + // 2) ScanAvailable : inclusive scan value available + // 3) Rendezvous : All threads inclusive scan value are available + // 4) ScanCompleted : exclusive scan value copied + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ; + typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > Init ; + + typedef typename Traits::value_type scalar_type ; + + const int rev_rank = m_pool_size - ( m_pool_rank + 1 ); + const unsigned count = Traits::value_count( f ); + + scalar_type * const work_value = (scalar_type *) reduce_memory(); + + //-------------------------------- + // Fan-in reduction with highest ranking thread as the root + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ]; + + // Wait: Active -> ReductionAvailable (or ScanAvailable) + Impl::spinwait( fan.m_pool_state , ThreadsExec::Active ); + Join::join( f , work_value , fan.reduce_memory() ); + } + + // Copy reduction value to scan value before releasing from this phase. 
+ for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i] ; } + + if ( rev_rank ) { + + // Set: Active -> ReductionAvailable + m_pool_state = ThreadsExec::ReductionAvailable ; + + // Wait for contributing threads' scan value to be available. + if ( ( 1 << m_pool_fan_size ) < ( m_pool_rank + 1 ) ) { + ThreadsExec & th = *m_pool_base[ rev_rank + ( 1 << m_pool_fan_size ) ] ; + + // Wait: Active -> ReductionAvailable + // Wait: ReductionAvailable -> ScanAvailable + Impl::spinwait( th.m_pool_state , ThreadsExec::Active ); + Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable ); + + Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count ); + } + + // This thread has completed inclusive scan + // Set: ReductionAvailable -> ScanAvailable + m_pool_state = ThreadsExec::ScanAvailable ; + + // Wait for all threads to complete inclusive scan + // Wait: ScanAvailable -> Rendezvous + Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable ); + } + + //-------------------------------- + + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ]; + // Wait: ReductionAvailable -> ScanAvailable + Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable ); + // Set: ScanAvailable -> Rendezvous + fan.m_pool_state = ThreadsExec::Rendezvous ; + } + + // All threads have completed the inclusive scan. + // All non-root threads are in the Rendezvous state. + // Threads are free to overwrite their reduction value. 
+ //-------------------------------- + + if ( ( rev_rank + 1 ) < m_pool_size ) { + // Exclusive scan: copy the previous thread's inclusive scan value + + ThreadsExec & th = *m_pool_base[ rev_rank + 1 ] ; // Not the root thread + + const scalar_type * const src_value = ((scalar_type *)th.reduce_memory()) + count ; + + for ( unsigned j = 0 ; j < count ; ++j ) { work_value[j] = src_value[j]; } + } + else { + (void) Init::init( f , work_value ); + } + + //-------------------------------- + // Wait for all threads to copy previous thread's inclusive scan value + // Wait for all threads: Rendezvous -> ScanCompleted + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous ); + } + if ( rev_rank ) { + // Set: ScanAvailable -> ScanCompleted + m_pool_state = ThreadsExec::ScanCompleted ; + // Wait: ScanCompleted -> Active + Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted ); + } + // Set: ScanCompleted -> Active + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ; + } + } + + template< class FunctorType , class ArgTag > + inline + void scan_small( const FunctorType & f ) + { + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ; + typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > Init ; + + typedef typename Traits::value_type scalar_type ; + + const int rev_rank = m_pool_size - ( m_pool_rank + 1 ); + const unsigned count = Traits::value_count( f ); + + scalar_type * const work_value = (scalar_type *) reduce_memory(); + + //-------------------------------- + // Fan-in reduction with highest ranking thread as the root + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + // Wait: Active -> Rendezvous + Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active ); + } + + for ( unsigned i = 
0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; } + + if ( rev_rank ) { + m_pool_state = ThreadsExec::Rendezvous ; + // Wait: Rendezvous -> Active + Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous ); + } + else { + // Root thread does the thread-scan before releasing threads + + scalar_type * ptr_prev = 0 ; + + for ( int rank = 0 ; rank < m_pool_size ; ++rank ) { + scalar_type * const ptr = (scalar_type *) get_thread( rank )->reduce_memory(); + if ( rank ) { + for ( unsigned i = 0 ; i < count ; ++i ) { ptr[i] = ptr_prev[ i + count ]; } + Join::join( f , ptr + count , ptr ); + } + else { + (void) Init::init( f , ptr ); + } + ptr_prev = ptr ; + } + } + + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ; + } + } + + //------------------------------------ + /** \brief Wait for previous asynchronous functor to + * complete and release the Threads device. + * Acquire the Threads device and start this functor. 
+ */ + static void start( void (*)( ThreadsExec & , const void * ) , const void * ); + + static int in_parallel(); + static void fence(); + static bool sleep(); + static bool wake(); +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +inline int Threads::in_parallel() +{ return Impl::ThreadsExec::in_parallel(); } + +inline int Threads::is_initialized() +{ return Impl::ThreadsExec::is_initialized(); } + +inline void Threads::initialize( + unsigned threads_count , + unsigned use_numa_count , + unsigned use_cores_per_numa , + bool allow_asynchronous_threadpool ) +{ + Impl::ThreadsExec::initialize( threads_count , use_numa_count , use_cores_per_numa , allow_asynchronous_threadpool ); +} + +inline void Threads::finalize() +{ + Impl::ThreadsExec::finalize(); +} + +inline void Threads::print_configuration( std::ostream & s , const bool detail ) +{ + Impl::ThreadsExec::print_configuration( s , detail ); +} + +inline bool Threads::sleep() +{ return Impl::ThreadsExec::sleep() ; } + +inline bool Threads::wake() +{ return Impl::ThreadsExec::wake() ; } + +inline void Threads::fence() +{ Impl::ThreadsExec::fence() ; } + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_THREADSEXEC_HPP */ + diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp new file mode 100755 index 0000000000000000000000000000000000000000..40d5efd0fe21e5db54bee49ac98e9bc1af1b12bd --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp @@ -0,0 +1,254 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core_fwd.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_HAVE_PTHREAD ) + +/* Standard 'C' Linux libraries */ + +#include <pthread.h> +#include <sched.h> +#include <errno.h> + +/* Standard C++ libaries */ + +#include <cstdlib> +#include <string> +#include <iostream> +#include <stdexcept> + +#include <Kokkos_Threads.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +namespace { + +pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER ; + +// Pthreads compatible driver. +// Recovery from an exception would require constant intra-thread health +// verification; which would negatively impact runtime. As such simply +// abort the process. + +void * internal_pthread_driver( void * ) +{ + try { + ThreadsExec::driver(); + } + catch( const std::exception & x ) { + std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl ; + std::cerr.flush(); + std::abort(); + } + catch( ... 
) { + std::cerr << "Exception thrown from worker thread" << std::endl ; + std::cerr.flush(); + std::abort(); + } + return NULL ; +} + +} // namespace + +//---------------------------------------------------------------------------- +// Spawn a thread + +bool ThreadsExec::spawn() +{ + bool result = false ; + + pthread_attr_t attr ; + + if ( 0 == pthread_attr_init( & attr ) || + 0 == pthread_attr_setscope( & attr, PTHREAD_SCOPE_SYSTEM ) || + 0 == pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) { + + pthread_t pt ; + + result = 0 == pthread_create( & pt, & attr, internal_pthread_driver, 0 ); + } + + pthread_attr_destroy( & attr ); + + return result ; +} + +//---------------------------------------------------------------------------- + +bool ThreadsExec::is_process() +{ + static const pthread_t master_pid = pthread_self(); + + return pthread_equal( master_pid , pthread_self() ); +} + +void ThreadsExec::global_lock() +{ + pthread_mutex_lock( & host_internal_pthread_mutex ); +} + +void ThreadsExec::global_unlock() +{ + pthread_mutex_unlock( & host_internal_pthread_mutex ); +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::wait_yield( volatile int & flag , const int value ) +{ + while ( value == flag ) { sched_yield(); } +} + +} // namespace Impl +} // namespace Kokkos + +/* end #if defined( KOKKOS_HAVE_PTHREAD ) */ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_HAVE_WINTHREAD ) + +/* Windows libraries */ +#include <windows.h> +#include <process.h> + +/* Standard C++ libaries */ + +#include <cstdlib> +#include <string> +#include <iostream> +#include <stdexcept> + +#include <Kokkos_Threads.hpp> + +//---------------------------------------------------------------------------- +// Driver for each created pthread + +namespace Kokkos { +namespace Impl { +namespace { + 
+unsigned WINAPI internal_winthread_driver( void * arg )
+{
+  ThreadsExec::driver();
+
+  return 0 ;
+}
+
+class ThreadLockWindows {
+private:
+  CRITICAL_SECTION m_handle ;
+
+  ~ThreadLockWindows()
+  { DeleteCriticalSection( & m_handle ); }
+
+  ThreadLockWindows()
+  { InitializeCriticalSection( & m_handle ); }
+
+  ThreadLockWindows( const ThreadLockWindows & );
+  ThreadLockWindows & operator = ( const ThreadLockWindows & );
+
+public:
+
+  static ThreadLockWindows & singleton();
+
+  void lock()
+  { EnterCriticalSection( & m_handle ); }
+
+  void unlock()
+  { LeaveCriticalSection( & m_handle ); }
+};
+
+ThreadLockWindows & ThreadLockWindows::singleton()
+{ static ThreadLockWindows self ; return self ; }
+
+} // namespace <>
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Spawn this thread
+
+bool ThreadsExec::spawn()
+{
+  unsigned Win32ThreadID = 0 ;
+
+  HANDLE handle =
+    _beginthreadex(0,0,internal_winthread_driver,0,0, & Win32ThreadID );
+
+  return 0 != handle ; // non-zero handle == success, matching the pthread variant
+}
+
+bool ThreadsExec::is_process() { return true ; }
+
+void ThreadsExec::global_lock()
+{ ThreadLockWindows::singleton().lock(); }
+
+void ThreadsExec::global_unlock()
+{ ThreadLockWindows::singleton().unlock(); }
+
+void ThreadsExec::wait_yield( volatile int & flag , const int value )
+{
+  while ( value == flag ) { Sleep(0); }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* end #elif defined( KOKKOS_HAVE_WINTHREAD ) */
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..53b5eb01dff4f745ef3e8486394dceda96457638
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@@ -0,0 +1,730 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADSTEAM_HPP +#define KOKKOS_THREADSTEAM_HPP + +#include <stdio.h> + +#include <utility> +#include <impl/Kokkos_spinwait.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> + +#include <Kokkos_Atomic.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +template< class > struct ThreadsExecAdapter ; + +//---------------------------------------------------------------------------- + +class ThreadsExecTeamMember { +private: + + enum { TEAM_REDUCE_SIZE = 512 }; + + typedef Kokkos::Threads execution_space ; + typedef execution_space::scratch_memory_space space ; + + ThreadsExec * const m_exec ; + ThreadsExec * const * m_team_base ; ///< Base for team fan-in + space m_team_shared ; + int m_team_shared_size ; + int m_team_size ; + int m_team_rank ; + int m_team_rank_rev ; + int m_league_size ; + int m_league_end ; + int m_league_rank ; + + inline + void set_team_shared() 
+ { new( & m_team_shared ) space( ((char *) (*m_team_base)->scratch_memory()) + TEAM_REDUCE_SIZE , m_team_shared_size ); } + +public: + + // Fan-in and wait until the matching fan-out is called. + // The root thread which does not wait will return true. + // All other threads will return false during the fan-out. + KOKKOS_INLINE_FUNCTION bool team_fan_in() const + { + int n , j ; + + // Wait for fan-in threads + for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) { + Impl::spinwait( m_team_base[j]->state() , ThreadsExec::Active ); + } + + // If not root then wait for release + if ( m_team_rank_rev ) { + m_exec->state() = ThreadsExec::Rendezvous ; + Impl::spinwait( m_exec->state() , ThreadsExec::Rendezvous ); + } + + return ! m_team_rank_rev ; + } + + KOKKOS_INLINE_FUNCTION void team_fan_out() const + { + int n , j ; + for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) { + m_team_base[j]->state() = ThreadsExec::Active ; + } + } + +public: + + KOKKOS_INLINE_FUNCTION static int team_reduce_size() { return TEAM_REDUCE_SIZE ; } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space & team_shmem() const + { return m_team_shared ; } + + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; } + KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; } + KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; } + KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; } + + KOKKOS_INLINE_FUNCTION void team_barrier() const + { + team_fan_in(); + team_fan_out(); + } + + template<class ValueType> + KOKKOS_INLINE_FUNCTION + void team_broadcast(ValueType& value, const int& thread_id) const + { +#if ! 
defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { } +#else + // Make sure there is enough scratch space: + typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE + , ValueType , void >::type type ; + + if ( m_team_base ) { + type * const local_value = ((type*) m_team_base[0]->scratch_memory()); + if(team_rank() == thread_id) *local_value = value; + memory_fence(); + team_barrier(); + value = *local_value; + } +#endif + } + + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const +#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return Type(); } +#else + { + // Make sure there is enough scratch space: + typedef typename if_c< sizeof(Type) < TEAM_REDUCE_SIZE , Type , void >::type type ; + + if ( 0 == m_exec ) return value ; + + *((volatile type*) m_exec->scratch_memory() ) = value ; + + memory_fence(); + + type & accum = *((type *) m_team_base[0]->scratch_memory() ); + + if ( team_fan_in() ) { + for ( int i = 1 ; i < m_team_size ; ++i ) { + accum += *((type *) m_team_base[i]->scratch_memory() ); + } + memory_fence(); + } + + team_fan_out(); + + return accum ; + } +#endif + +#ifdef KOKKOS_HAVE_CXX11 + template< class ValueType, class JoinOp > + KOKKOS_INLINE_FUNCTION ValueType + team_reduce( const ValueType & value + , const JoinOp & op_in ) const + #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return ValueType(); } + #else + { + typedef ValueType value_type; + const JoinLambdaAdapter<value_type,JoinOp> op(op_in); + #endif +#else // KOKKOS_HAVE_CXX11 + template< class JoinOp > + KOKKOS_INLINE_FUNCTION typename JoinOp::value_type + team_reduce( const typename JoinOp::value_type & value + , const JoinOp & op ) const + #if ! 
defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return typename JoinOp::value_type(); } + #else + { + typedef typename JoinOp::value_type value_type; + #endif +#endif // KOKKOS_HAVE_CXX11 +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + // Make sure there is enough scratch space: + typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE + , value_type , void >::type type ; + + if ( 0 == m_exec ) return value ; + + type * const local_value = ((type*) m_exec->scratch_memory()); + + // Set this thread's contribution + *local_value = value ; + + // Fence to make sure the base team member has access: + memory_fence(); + + if ( team_fan_in() ) { + // The last thread to synchronize returns true, all other threads wait for team_fan_out() + type * const team_value = ((type*) m_team_base[0]->scratch_memory()); + + // Join to the team value: + for ( int i = 1 ; i < m_team_size ; ++i ) { + op.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) ); + } + + // Team base thread may "lap" member threads so copy out to their local value. + for ( int i = 1 ; i < m_team_size ; ++i ) { + *((type*) m_team_base[i]->scratch_memory()) = *team_value ; + } + + // Fence to make sure all team members have access + memory_fence(); + } + + team_fan_out(); + + // Value was changed by the team base + return *((type volatile const *) local_value); + } +#endif + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template< typename ArgType > + KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const +#if ! 
defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return ArgType(); } +#else + { + // Make sure there is enough scratch space: + typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ; + + if ( 0 == m_exec ) return type(0); + + volatile type * const work_value = ((type*) m_exec->scratch_memory()); + + *work_value = value ; + + memory_fence(); + + if ( team_fan_in() ) { + // The last thread to synchronize returns true, all other threads wait for team_fan_out() + // m_team_base[0] == highest ranking team member + // m_team_base[ m_team_size - 1 ] == lowest ranking team member + // + // 1) copy from lower to higher rank, initialize lowest rank to zero + // 2) prefix sum from lowest to highest rank, skipping lowest rank + + type accum = 0 ; + + if ( global_accum ) { + for ( int i = m_team_size ; i-- ; ) { + type & val = *((type*) m_team_base[i]->scratch_memory()); + accum += val ; + } + accum = atomic_fetch_add( global_accum , accum ); + } + + for ( int i = m_team_size ; i-- ; ) { + type & val = *((type*) m_team_base[i]->scratch_memory()); + const type offset = accum ; + accum += val ; + val = offset ; + } + + memory_fence(); + } + + team_fan_out(); + + return *work_value ; + } +#endif + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. 
+ * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template< typename ArgType > + KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value ) const + { return this-> template team_scan<ArgType>( value , 0 ); } + + + //---------------------------------------- + // Private for the driver + + template< class Arg0 , class Arg1 > + ThreadsExecTeamMember( Impl::ThreadsExec * exec + , const TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > & team + , const int shared_size ) + : m_exec( exec ) + , m_team_base(0) + , m_team_shared(0,0) + , m_team_shared_size( shared_size ) + , m_team_size(0) + , m_team_rank(0) + , m_team_rank_rev(0) + , m_league_size(0) + , m_league_end(0) + , m_league_rank(0) + { + if ( team.league_size() ) { + // Execution is using device-team interface: + + const int pool_rank_rev = m_exec->pool_size() - ( m_exec->pool_rank() + 1 ); + const int team_rank_rev = pool_rank_rev % team.team_alloc(); + + // May be using fewer threads per team than a multiple of threads per core, + // some threads will idle. 
+ + if ( team_rank_rev < team.team_size() ) { + const size_t pool_league_size = m_exec->pool_size() / team.team_alloc() ; + const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ; + const size_t pool_league_rank = pool_league_size - ( pool_league_rank_rev + 1 ); + + m_team_base = m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev ; + m_team_size = team.team_size() ; + m_team_rank = team.team_size() - ( team_rank_rev + 1 ); + m_team_rank_rev = team_rank_rev ; + m_league_size = team.league_size(); + + m_league_rank = ( team.league_size() * pool_league_rank ) / pool_league_size ; + m_league_end = ( team.league_size() * (pool_league_rank+1) ) / pool_league_size ; + + set_team_shared(); + } + } + } + + ThreadsExecTeamMember() + : m_exec(0) + , m_team_base(0) + , m_team_shared(0,0) + , m_team_shared_size(0) + , m_team_size(1) + , m_team_rank(0) + , m_team_rank_rev(0) + , m_league_size(1) + , m_league_end(0) + , m_league_rank(0) + {} + + inline + ThreadsExec & threads_exec_team_base() const { return m_team_base ? 
**m_team_base : *m_exec ; } + + bool valid() const + { return m_league_rank < m_league_end ; } + + void next() + { + if ( ++m_league_rank < m_league_end ) { + team_barrier(); + set_team_shared(); + } + } + + void set_league_shmem( const int arg_league_rank + , const int arg_league_size + , const int arg_shmem_size + ) + { + m_league_rank = arg_league_rank ; + m_league_size = arg_league_size ; + m_team_shared_size = arg_shmem_size ; + set_team_shared(); + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class Arg0 , class Arg1 > +class TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > +{ +private: + + int m_league_size ; + int m_team_size ; + int m_team_alloc ; + + inline + void init( const int league_size_request + , const int team_size_request ) + { + const int pool_size = execution_space::thread_pool_size(0); + const int team_max = execution_space::thread_pool_size(1); + const int team_grain = execution_space::thread_pool_size(2); + + m_league_size = league_size_request ; + + m_team_size = team_size_request < team_max ? + team_size_request : team_max ; + + // Round team size up to a multiple of 'team_gain' + const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain ); + const int team_count = pool_size / team_size_grain ; + + // Constraint : pool_size = m_team_alloc * team_count + m_team_alloc = pool_size / team_count ; + } + + +public: + + //! Tag this class as a kokkos execution policy + typedef TeamPolicy execution_policy ; + typedef Kokkos::Threads execution_space ; + + typedef typename + Impl::if_c< ! 
Impl::is_same< Kokkos::Threads , Arg0 >::value , Arg0 , Arg1 >::type + work_tag ; + + //---------------------------------------- + + template< class FunctorType > + inline static + int team_size_max( const FunctorType & ) + { return execution_space::thread_pool_size(1); } + + template< class FunctorType > + static int team_size_recommended( const FunctorType & ) + { return execution_space::thread_pool_size(2); } + + + template< class FunctorType > + inline static + int team_size_recommended( const FunctorType &, const int& ) + { return execution_space::thread_pool_size(2); } + + //---------------------------------------- + + inline int team_size() const { return m_team_size ; } + inline int team_alloc() const { return m_team_alloc ; } + inline int league_size() const { return m_league_size ; } + + /** \brief Specify league size, request team size */ + TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 ) + : m_league_size(0) + , m_team_size(0) + , m_team_alloc(0) + { init(league_size_request,team_size_request); (void) vector_length_request; } + + TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 ) + : m_league_size(0) + , m_team_size(0) + , m_team_alloc(0) + { init(league_size_request,team_size_request); (void) vector_length_request; } + + typedef Impl::ThreadsExecTeamMember member_type ; + + friend class Impl::ThreadsExecTeamMember ; +}; + + +} /* namespace Kokkos */ + + +namespace Kokkos { + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember> +TeamThreadRange(const Impl::ThreadsExecTeamMember& thread, const iType& count) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>(thread,count); +} + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember> +TeamThreadRange( const 
Impl::ThreadsExecTeamMember& thread + , const iType & begin + , const iType & end + ) +{ + return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>(thread,begin,end); +} + + +template<typename iType> +KOKKOS_INLINE_FUNCTION +Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember > + ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread, const iType& count) { + return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >(thread,count); +} + + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember> PerTeam(const Impl::ThreadsExecTeamMember& thread) { + return Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember> PerThread(const Impl::ThreadsExecTeamMember& thread) { + return Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>(thread); +} +} // namespace Kokkos + +namespace Kokkos { + + /** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team. + * This functionality requires C++11 support.*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda) { + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i); +} + +/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of + * val is performed and put into result. 
This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries, + const Lambda & lambda, ValueType& result) { + + result = ValueType(); + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + result+=tmp; + } + + result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>()); +} + +#if defined( KOKKOS_HAVE_CXX11 ) + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of + * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result. + * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore + * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or + * '1 for *'). This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries, + const Lambda & lambda, const JoinType& join, ValueType& init_result) { + + ValueType result = init_result; + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + join(result,tmp); + } + + init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join)); +} + +#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */ + +} //namespace Kokkos + + +namespace Kokkos { +/** \brief Intra-thread vector parallel_for. 
Executes lambda(iType i) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread. + * This functionality requires C++11 support.*/ +template<typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION +void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >& + loop_boundaries, const Lambda& lambda) { + #ifdef KOKKOS_HAVE_PRAGMA_IVDEP + #pragma ivdep + #endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) + lambda(i); +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of + * val is performed and put into result. This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >& + loop_boundaries, const Lambda & lambda, ValueType& result) { + result = ValueType(); +#ifdef KOKKOS_HAVE_PRAGMA_IVDEP +#pragma ivdep +#endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + result+=tmp; + } +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of + * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result. + * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore + * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or + * '1 for *'). 
This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ValueType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >& + loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) { + + ValueType result = init_result; +#ifdef KOKKOS_HAVE_PRAGMA_IVDEP +#pragma ivdep +#endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + ValueType tmp = ValueType(); + lambda(i,tmp); + join(result,tmp); + } + init_result = result; +} + +/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final) + * for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed. + * Depending on the target execution space the operator might be called twice: once with final=false + * and once with final=true. When final==true val contains the prefix sum value. The contribution of this + * "i" needs to be added to val no matter whether final==true or not. In a serial execution + * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set + * to the final sum value over all vector lanes. 
+ * This functionality requires C++11 support.*/ +template< typename iType, class FunctorType > +KOKKOS_INLINE_FUNCTION +void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >& + loop_boundaries, const FunctorType & lambda) { + + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ; + typedef typename ValueTraits::value_type value_type ; + + value_type scan_val = value_type(); + +#ifdef KOKKOS_HAVE_PRAGMA_IVDEP +#pragma ivdep +#endif + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,scan_val,true); + } +} + +} // namespace Kokkos + +namespace Kokkos { + +template<class FunctorType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) { + lambda(); +} + +template<class FunctorType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) { + if(single_struct.team_member.team_rank()==0) lambda(); +} + +template<class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) { + lambda(val); +} + +template<class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION +void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) { + if(single_struct.team_member.team_rank()==0) { + lambda(val); + } + single_struct.team_member.team_broadcast(val,0); +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_THREADSTEAM_HPP */ + diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp 
new file mode 100755 index 0000000000000000000000000000000000000000..4b2a16912693abfac48ffe87d04f4a4c1c9aa885 --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp @@ -0,0 +1,427 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_PARALLEL_HPP +#define KOKKOS_THREADS_PARALLEL_HPP + +#include <vector> + +#include <Kokkos_Parallel.hpp> + +#include <impl/Kokkos_StaticAssert.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ; + + const FunctorType m_func ; + const Policy m_policy ; + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , const PType & range ) + { + const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( i ); + } + } + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( 
typename Impl::enable_if< + ( ! Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , const PType & range ) + { + const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( typename PType::work_tag() , i ); + } + } + + static void execute( ThreadsExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() ) ); + + exec.fan_in(); + } + +public: + + ParallelFor( const FunctorType & functor + , const Policy & policy ) + : m_func( functor ) + , m_policy( policy ) + { + ThreadsExec::start( & ParallelFor::execute , this ); + + ThreadsExec::fence(); + } +}; + +template< class FunctorType , class Arg0 , class Arg1 > +class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > > +{ +private: + + typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ; + + const FunctorType m_func ; + const Policy m_policy ; + const int m_shared ; + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION + void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member ) const + { m_func( member ); } + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION + void driver( typename Impl::enable_if< ! 
Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member ) const + { m_func( TagType() , member ); } + + static void execute( ThreadsExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + typename Policy::member_type member( & exec , self.m_policy , self.m_shared ); + + for ( ; member.valid() ; member.next() ) { + self.ParallelFor::template driver< typename Policy::work_tag >( member ); + } + + exec.fan_in(); + } + +public: + + ParallelFor( const FunctorType & functor + , const Policy & policy ) + : m_func( functor ) + , m_policy( policy ) + , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) ) + { + ThreadsExec::resize_scratch( 0 , Policy::member_type::team_reduce_size() + m_shared ); + + ThreadsExec::start( & ParallelFor::execute , this ); + + ThreadsExec::fence(); + } +}; + + + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ; + typedef typename Policy::work_tag work_tag ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + const FunctorType m_func ; + const Policy m_policy ; + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , reference_type update + , const PType & range ) + { + 
const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( i , update ); + } + } + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( ! Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , reference_type update + , const PType & range ) + { + const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( typename PType::work_tag() , i , update ); + } + } + + static void execute( ThreadsExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + + driver( self.m_func + , ValueInit::init( self.m_func , exec.reduce_memory() ) + , typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() ) + ); + + exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func ); + } + +public: + + template< class HostViewType > + ParallelReduce( const FunctorType & functor , + const Policy & policy , + const HostViewType & result_view ) + : m_func( functor ) + , m_policy( policy ) + { + ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , 0 ); + + ThreadsExec::start( & ParallelReduce::execute , this ); + + const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch(); + + ThreadsExec::fence(); + + if ( result_view.ptr_on_device() ) { + const unsigned n = ValueTraits::value_count( m_func ); + for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; } + } + } +}; + +//---------------------------------------------------------------------------- + +template< class FunctorType , class Arg0 , class Arg1 > +class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > > +{ +private: + + typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ; + typedef typename Policy::work_tag work_tag ; + 
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + const FunctorType m_func ; + const Policy m_policy ; + const int m_shared ; + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION + void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member + , reference_type update ) const + { m_func( member , update ); } + + template< class TagType > + KOKKOS_FORCEINLINE_FUNCTION + void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value , + const typename Policy::member_type & >::type member + , reference_type update ) const + { m_func( TagType() , member , update ); } + + static void execute( ThreadsExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + + // Initialize thread-local value + reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() ); + + typename Policy::member_type member( & exec , self.m_policy , self.m_shared ); + for ( ; member.valid() ; member.next() ) { + self.ParallelReduce::template driver< work_tag >( member , update ); + } + + exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func ); + } + +public: + + ParallelReduce( const FunctorType & functor + , const Policy & policy ) + : m_func( functor ) + , m_policy( policy ) + , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) ) + { + ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared ); + + ThreadsExec::start( & ParallelReduce::execute , this ); + + ThreadsExec::fence(); + } + + template< class ViewType > + ParallelReduce( const FunctorType & functor + , const Policy & policy + , const 
ViewType & result ) + : m_func( functor ) + , m_policy( policy ) + , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) ) + { + ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared ); + + ThreadsExec::start( & ParallelReduce::execute , this ); + + const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch(); + + ThreadsExec::fence(); + + const unsigned n = ValueTraits::value_count( m_func ); + for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; } + } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template< class FunctorType , class Arg0 , class Arg1 , class Arg2 > +class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > > +{ +private: + + typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ; + typedef typename Policy::work_tag work_tag ; + typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + const FunctorType m_func ; + const Policy m_policy ; + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , reference_type update + , const bool final + , const PType & range ) + { + const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( i , update , final ); + } + } + + template< class PType > + KOKKOS_FORCEINLINE_FUNCTION static + void driver( typename Impl::enable_if< + ( ! 
Impl::is_same< typename PType::work_tag , void >::value ) + , const FunctorType & >::type functor + , reference_type update + , const bool final + , const PType & range ) + { + const typename PType::member_type e = range.end(); + for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) { + functor( typename PType::work_tag() , i , update , final ); + } + } + + static void execute( ThreadsExec & exec , const void * arg ) + { + const ParallelScan & self = * ((const ParallelScan *) arg ); + + const typename Policy::WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() ); + + reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() ); + + driver( self.m_func , update , false , range ); + + // exec.<FunctorType,work_tag>scan_large( self.m_func ); + exec.template scan_small<FunctorType,work_tag>( self.m_func ); + + driver( self.m_func , update , true , range ); + + exec.fan_in(); + } + +public: + + ParallelScan( const FunctorType & functor , const Policy & policy ) + : m_func( functor ) + , m_policy( policy ) + { + ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_func ) , 0 ); + ThreadsExec::start( & ParallelScan::execute , this ); + ThreadsExec::fence(); + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */ + diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp new file mode 100755 index 0000000000000000000000000000000000000000..8ad7f15ecc2f9c0b6c623088d3fd341dc29c0c03 --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp @@ -0,0 +1,599 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#include <stdio.h> +#include <iostream> +#include <sstream> +#include <Threads/Kokkos_Threads_TaskPolicy.hpp> + +#if defined( KOKKOS_HAVE_PTHREAD ) + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +typedef TaskMember< Kokkos::Threads , void , void > Task ; + +namespace { + +int volatile s_count_serial = 0 ; +int volatile s_count_team = 0 ; +Task * volatile s_ready_team = 0 ; +Task * volatile s_ready_serial = 0 ; +Task * const s_lock = reinterpret_cast<Task*>( ~((unsigned long)0) ); +Task * const s_denied = reinterpret_cast<Task*>( ~((unsigned long)0) - 1 ); + +} /* namespace */ +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + +namespace Kokkos { +namespace Experimental { + +TaskPolicy< Kokkos::Threads >::TaskPolicy + ( const unsigned arg_default_dependence_capacity + , const unsigned arg_team_size + ) + : m_default_dependence_capacity( arg_default_dependence_capacity ) + , m_team_size( arg_team_size ) +{ + const int threads_total = Threads::thread_pool_size(0); + const int threads_per_numa = Threads::thread_pool_size(1); + const int threads_per_core = Threads::thread_pool_size(2); + + if ( 0 == arg_team_size ) { + // If a team task then claim for execution until count is zero + // Issue: team collectives cannot assume which pool members are in the team. + // Issue: team must only span a single NUMA region. + + // If more than one thread per core then map cores to work team, + // else map numa to work team. 
+ + if ( 1 < threads_per_core ) m_team_size = threads_per_core ; + else if ( 1 < threads_per_numa ) m_team_size = threads_per_numa ; + else m_team_size = 1 ; + } + + // Verify a valid team size + const bool valid_team_size = + ( 0 < m_team_size && m_team_size <= threads_total ) && + ( + ( 1 == m_team_size ) || + ( threads_per_core == m_team_size ) || + ( threads_per_numa == m_team_size ) + ); + + if ( ! valid_team_size ) { + std::ostringstream msg ; + + msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Threads > ERROR" + << " invalid team_size(" << m_team_size << ")" + << " threads_per_core(" << threads_per_core << ")" + << " threads_per_numa(" << threads_per_numa << ")" + << " threads_total(" << threads_total << ")" + ; + + Kokkos::Impl::throw_runtime_exception( msg.str() ); + + } +} + +TaskPolicy< Kokkos::Threads >::member_type & +TaskPolicy< Kokkos::Threads >::member_single() +{ + static member_type s ; + return s ; +} + +void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Threads > & policy ) +{ + typedef Kokkos::Impl::ThreadsExecTeamMember member_type ; + + enum { BASE_SHMEM = 1024 }; + + void * const arg = reinterpret_cast<void*>( long( policy.m_team_size ) ); + + Kokkos::Impl::ThreadsExec::resize_scratch( 0 , member_type::team_reduce_size() + BASE_SHMEM ); + Kokkos::Impl::ThreadsExec::start( & Impl::Task::execute_ready_tasks_driver , arg ); + Kokkos::Impl::ThreadsExec::fence(); +} + +} /* namespace Experimental */ +} /* namespace Kokkos */ + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +//---------------------------------------------------------------------------- + +void Task::throw_error_verify_type() +{ + Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::verify_type ERROR"); +} + +void Task::deallocate( void * ptr ) +{ + free( ptr ); +} + +void * Task::allocate( const unsigned n ) +{ + void * const ptr = malloc(n); + + return ptr ; +} + +Task::~TaskMember() +{ +} + 
+//---------------------------------------------------------------------------- + +void Task::reschedule() +{ + // Reschedule transitions from executing back to waiting. + const int old_state = atomic_compare_exchange( & m_state , int(TASK_STATE_EXECUTING) , int(TASK_STATE_WAITING) ); + + if ( old_state != int(TASK_STATE_EXECUTING) ) { + +fprintf( stderr + , "reschedule ERROR task[%lx] state(%d)\n" + , (unsigned long) this + , old_state + ); +fflush(stderr); + + } +} + +void Task::schedule() +{ + //---------------------------------------- + // State is either constructing or already waiting. + // If constructing then transition to waiting. + + { + const int old_state = atomic_compare_exchange( & m_state , int(TASK_STATE_CONSTRUCTING) , int(TASK_STATE_WAITING) ); + Task * const waitTask = *((Task * volatile const *) & m_wait ); + Task * const next = *((Task * volatile const *) & m_next ); + + if ( s_denied == waitTask || 0 != next || + ( old_state != int(TASK_STATE_CONSTRUCTING) && + old_state != int(TASK_STATE_WAITING) ) ) { + fprintf(stderr,"Task::schedule task(0x%lx) STATE ERROR: state(%d) wait(0x%lx) next(0x%lx)\n" + , (unsigned long) this + , old_state + , (unsigned long) waitTask + , (unsigned long) next ); + fflush(stderr); + Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::Task spawn or respawn state error"); + } + } + + //---------------------------------------- + // Insert this task into another dependence that is not complete + // Push on to the wait queue, fails if ( s_denied == m_dep[i]->m_wait ) + + bool insert_in_ready_queue = true ; + + for ( int i = 0 ; i < m_dep_size && insert_in_ready_queue ; ) { + + Task * const task_dep = m_dep[i] ; + Task * const head_value_old = *((Task * volatile *) & task_dep->m_wait ); + + if ( s_denied == head_value_old ) { + // Wait queue is closed, try again with the next queue + ++i ; + } + else { + + // Wait queue is open and not locked. + // If CAS succeeds then have acquired the lock. 
+ + // Have exclusive access to this task. + // Assign m_next assuming a successfull insertion into the queue. + // Fence the memory assignment before attempting the CAS. + + *((Task * volatile *) & m_next ) = head_value_old ; + + memory_fence(); + + // Attempt to insert this task into the queue + + Task * const wait_queue_head = atomic_compare_exchange( & task_dep->m_wait , head_value_old , this ); + + if ( head_value_old == wait_queue_head ) { + insert_in_ready_queue = false ; + } + } + } + + //---------------------------------------- + // All dependences are complete, insert into the ready list + + if ( insert_in_ready_queue ) { + + // Increment the count of ready tasks. + // Count is decremented when task is complete. + + Task * volatile * queue = 0 ; + + if ( m_serial ) { + atomic_increment( & s_count_serial ); + queue = & s_ready_serial ; + } + else { + atomic_increment( & s_count_team ); + queue = & s_ready_team ; + } + + while ( insert_in_ready_queue ) { + + Task * const head_value_old = *queue ; + + if ( s_lock != head_value_old ) { + // Read the head of ready queue, if same as previous value then CAS locks the ready queue + // Only access via CAS + + // Have exclusive access to this task, assign to head of queue, assuming successful insert + // Fence assignment before attempting insert. + *((Task * volatile *) & m_next ) = head_value_old ; + + memory_fence(); + + Task * const ready_queue_head = atomic_compare_exchange( queue , head_value_old , this ); + + if ( head_value_old == ready_queue_head ) { + // Successful insert + insert_in_ready_queue = false ; // done + } + } + } + } +} + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + +void Task::assign( Task ** const lhs_ptr , Task * rhs ) +{ + // Increment rhs reference count. + if ( rhs ) { atomic_increment( & rhs->m_ref_count ); } + + // Assign the pointer and retrieve the previous value. 
+ + Task * const old_lhs = atomic_exchange( lhs_ptr , rhs ); + + if ( old_lhs ) { + + // Decrement former lhs reference count. + // If reference count is zero task must be complete, then delete task. + // Task is ready for deletion when wait == s_denied + + int const count = atomic_fetch_add( & (old_lhs->m_ref_count) , -1 ) - 1 ; + + // if 'count != 0' then 'old_lhs' may be deallocated before dereferencing + Task * const wait = count == 0 ? *((Task * const volatile *) & old_lhs->m_wait ) : (Task*) 0 ; + + if ( count < 0 || ( count == 0 && wait != s_denied ) ) { + + static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Threads>::assign ERROR deleting" ; + + fprintf( stderr , "%s task(0x%lx) m_ref_count(%d) , m_wait(0x%ld)\n" + , msg_error_header + , (unsigned long) old_lhs + , count + , (unsigned long) wait ); + fflush(stderr); + + Kokkos::Impl::throw_runtime_exception( msg_error_header ); + } + + if ( count == 0 ) { + // When 'count == 0' this thread has exclusive access to 'old_lhs' + const Task::function_dealloc_type d = old_lhs->m_dealloc ; + (*d)( old_lhs ); + } + } +} + +#endif + +//---------------------------------------------------------------------------- + +Task * Task::get_dependence( int i ) const +{ + Task * const t = m_dep[i] ; + + if ( Kokkos::Experimental::TASK_STATE_EXECUTING != m_state || i < 0 || m_dep_size <= i || 0 == t ) { + +fprintf( stderr + , "TaskMember< Threads >::get_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) dep[%d] = %lx }\n" + , (unsigned long) this + , m_state + , m_dep_size + , i + , (unsigned long) t + ); +fflush( stderr ); + + Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::get_dependence ERROR"); + } + + return t ; +} + +//---------------------------------------------------------------------------- + +void Task::add_dependence( Task * before ) +{ + if ( before != 0 ) { + + int const state = *((volatile const int *) & m_state ); + + // Can add dependence during construction or during 
execution + + if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == state || + Kokkos::Experimental::TASK_STATE_EXECUTING == state ) && + m_dep_size < m_dep_capacity ) { + + ++m_dep_size ; + + assign( m_dep + (m_dep_size-1) , before ); + + memory_fence(); + } + else { + +fprintf( stderr + , "TaskMember< Threads >::add_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) m_dep_capacity(%d) }\n" + , (unsigned long) this + , m_state + , m_dep_size + , m_dep_capacity + ); +fflush( stderr ); + + Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::add_dependence ERROR"); + } + } +} + +//---------------------------------------------------------------------------- + +void Task::clear_dependence() +{ + for ( int i = m_dep_size - 1 ; 0 <= i ; --i ) { + assign( m_dep + i , 0 ); + } + + *((volatile int *) & m_dep_size ) = 0 ; + + memory_fence(); +} + +//---------------------------------------------------------------------------- + +Task * Task::pop_ready_task( Task * volatile * const queue ) +{ + Task * const task_old = *queue ; + + if ( s_lock != task_old && 0 != task_old ) { + + Task * const task = atomic_compare_exchange( queue , task_old , s_lock ); + + if ( task_old == task ) { + + // May have acquired the lock and task. + // One or more other threads may have acquired this same task and lock + // due to respawning ABA race condition. 
+ // Can only be sure of acquire with a successful state transition from waiting to executing + + const int old_state = atomic_compare_exchange( & task->m_state, int(TASK_STATE_WAITING), int(TASK_STATE_EXECUTING) ); + + if ( old_state == int(TASK_STATE_WAITING) ) { + + // Transitioned this task from waiting to executing + // Update the queue to the next entry and release the lock + + Task * const next_old = *((Task * volatile *) & task->m_next ); + + Task * const s = atomic_compare_exchange( queue , s_lock , next_old ); + + if ( s != s_lock ) { + fprintf(stderr,"Task::pop_ready_task( 0x%lx ) UNLOCK ERROR\n", (unsigned long) queue ); + fflush(stderr); + } + + *((Task * volatile *) & task->m_next ) = 0 ; + + return task ; + } + else { + fprintf(stderr,"Task::pop_ready_task( 0x%lx ) task(0x%lx) state(%d) ERROR\n" + , (unsigned long) queue + , (unsigned long) task + , old_state ); + fflush(stderr); + } + } + } + + return (Task *) 0 ; +} + + +void Task::complete_executed_task( Task * task , volatile int * const queue_count ) +{ + // State is either executing or if respawned then waiting, + // try to transition from executing to complete. + // Reads the current value. + + const int state_old = + atomic_compare_exchange( & task->m_state + , int(Kokkos::Experimental::TASK_STATE_EXECUTING) + , int(Kokkos::Experimental::TASK_STATE_COMPLETE) ); + + if ( Kokkos::Experimental::TASK_STATE_WAITING == state_old ) { + task->schedule(); /* Task requested a respawn so reschedule it */ + } + else if ( Kokkos::Experimental::TASK_STATE_EXECUTING != state_old ) { + fprintf( stderr + , "TaskMember< Threads >::execute_serial completion ERROR : task[%lx]{ state_old(%d) dep_size(%d) }\n" + , (unsigned long) & task + , state_old + , task->m_dep_size + ); + fflush( stderr ); + } + else { + + // Clear dependences of this task before locking wait queue + + task->clear_dependence(); + + // Stop other tasks from adding themselves to this task's wait queue. 
+ // The wait queue is updated concurrently so guard with an atomic. + // Setting the wait queue to denied denotes delete-ability of the task by any thread. + // Therefore, once 'denied' the task pointer must be treated as invalid. + + Task * wait_queue = *((Task * volatile *) & task->m_wait ); + Task * wait_queue_old = 0 ; + + do { + wait_queue_old = wait_queue ; + wait_queue = atomic_compare_exchange( & task->m_wait , wait_queue_old , s_denied ); + } while ( wait_queue_old != wait_queue ); + + task = 0 ; + + // Pop waiting tasks and schedule them + while ( wait_queue ) { + Task * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ; + x->schedule(); + } + } + + atomic_decrement( queue_count ); +} + +//---------------------------------------------------------------------------- + +void Task::execute_ready_tasks_driver( Kokkos::Impl::ThreadsExec & exec , const void * arg ) +{ + typedef Kokkos::Impl::ThreadsExecTeamMember member_type ; + + // Whole pool is calling this function + + // Create the thread team member with shared memory for the given task. 
+ const int team_size = reinterpret_cast<long>( arg ); + + member_type member( & exec , TeamPolicy< Kokkos::Threads >( 1 , team_size ) , 0 ); + + Kokkos::Impl::ThreadsExec & exec_team_base = member.threads_exec_team_base(); + + Task * volatile * const task_team_ptr = reinterpret_cast<Task**>( exec_team_base.reduce_memory() ); + + if ( member.team_fan_in() ) { + *task_team_ptr = 0 ; + Kokkos::memory_fence(); + } + member.team_fan_out(); + + long int iteration_count = 0 ; + + // Each team must iterate this loop synchronously to insure team-execution of team-task + + while ( 0 < s_count_serial || 0 < s_count_team ) { + + if ( member.team_rank() == 0 ) { + // Only one team member attempts to pop a team task + *task_team_ptr = pop_ready_task( & s_ready_team ); + } + + // Query if team acquired a team task + Task * const task_team = *task_team_ptr ; + + if ( task_team ) { + // Set shared memory + member.set_league_shmem( 0 , 1 , task_team->m_shmem_size ); + + (*task_team->m_team)( task_team , member ); + + // Do not proceed until all members have completed the task, + // the task has been completed or rescheduled, and + // the team task pointer has been cleared. 
+ if ( member.team_fan_in() ) { + complete_executed_task( task_team , & s_count_team ); + *task_team_ptr = 0 ; + Kokkos::memory_fence(); + } + member.team_fan_out(); + } + else { + Task * const task_serial = pop_ready_task( & s_ready_serial ); + + if ( task_serial ) { + if ( task_serial->m_serial ) (*task_serial->m_serial)( task_serial ); + + complete_executed_task( task_serial , & s_count_serial ); + } + } + + ++iteration_count ; + } + + exec.fan_in(); +} + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + +#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */ + diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp new file mode 100755 index 0000000000000000000000000000000000000000..024671324007da6f3dc668b113012234be73d77c --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp @@ -0,0 +1,584 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_THREADS_TASKPOLICY_HPP +#define KOKKOS_THREADS_TASKPOLICY_HPP + + +#include <Kokkos_Threads.hpp> +#include <Kokkos_TaskPolicy.hpp> + +#if defined( KOKKOS_HAVE_PTHREAD ) + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +/** \brief Base class for all Kokkos::Threads tasks */ +template<> +class TaskMember< Kokkos::Threads , void , void > { +public: + + typedef void (* function_dealloc_type)( TaskMember * ); + typedef TaskMember * (* function_verify_type) ( TaskMember * ); + typedef void (* function_single_type) ( TaskMember * ); + typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::ThreadsExecTeamMember & ); + +private: + + // Needed to disambiguate references to base class variables + // without triggering a false-positive on Intel compiler warning #955. 
+ typedef TaskMember< Kokkos::Threads , void , void > SelfType ; + + function_dealloc_type m_dealloc ; ///< Deallocation + function_verify_type m_verify ; ///< Result type verification + function_team_type m_team ; ///< Apply function + function_single_type m_serial ; ///< Apply function + TaskMember ** m_dep ; ///< Dependences + TaskMember * m_wait ; ///< Linked list of tasks waiting on this task + TaskMember * m_next ; ///< Linked list of tasks waiting on a different task + int m_dep_capacity ; ///< Capacity of dependences + int m_dep_size ; ///< Actual count of dependences + int m_shmem_size ; + int m_ref_count ; ///< Reference count + int m_state ; ///< State of the task + + // 7 pointers + 5 integers + +#if defined( KOKKOS_HAVE_CXX11 ) + TaskMember( const TaskMember & ) = delete ; + TaskMember & operator = ( const TaskMember & ) = delete ; +#else + TaskMember( const TaskMember & ); + TaskMember & operator = ( const TaskMember & ); +#endif + + static void * allocate( const unsigned arg_size ); + static void deallocate( void * ); + + template< class DerivedTaskType > + static + void deallocate( TaskMember * t ) + { + DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t); + ptr->~DerivedTaskType(); + deallocate( (void*) ptr ); + } + + static TaskMember * pop_ready_task( TaskMember * volatile * const queue ); + static void complete_executed_task( TaskMember * , volatile int * const ); + + static void throw_error_verify_type(); + +protected: + + TaskMember() + : m_dealloc(0) + , m_verify(0) + , m_team(0) + , m_serial(0) + , m_dep(0) + , m_wait(0) + , m_next(0) + , m_dep_capacity(0) + , m_dep_size(0) + , m_shmem_size(0) + , m_ref_count(0) + , m_state(0) + {} + +public: + + static void execute_ready_tasks_driver( Kokkos::Impl::ThreadsExec & , const void * ); + + ~TaskMember(); + + template< typename ResultType > + KOKKOS_FUNCTION static + TaskMember * verify_type( TaskMember * t ) + { + enum { check_type = ! 
Kokkos::Impl::is_same< ResultType , void >::value }; + + if ( check_type && t != 0 ) { + + // Verify that t->m_verify is this function + const function_verify_type self = & TaskMember::template verify_type< ResultType > ; + + if ( t->m_verify != self ) { + t = 0 ; +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + throw_error_verify_type(); +#endif + } + } + return t ; + } + + //---------------------------------------- + /* Inheritence Requirements on task types: + * + * class DerivedTaskType + * : public TaskMember< Threads , DerivedType::value_type , FunctorType > + * { ... }; + * + * class TaskMember< Threads , DerivedType::value_type , FunctorType > + * : public TaskMember< Threads , DerivedType::value_type , void > + * , public Functor + * { ... }; + * + * If value_type != void + * class TaskMember< Threads , value_type , void > + * : public TaskMember< Threads , void , void > + * + * Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ] + * + */ + //---------------------------------------- + + template< class DerivedTaskType , class Tag > + KOKKOS_FUNCTION static + void apply_single( typename Kokkos::Impl::enable_if< ! 
Kokkos::Impl::is_same< typename DerivedTaskType::result_type , void >::value + , TaskMember * >::type t ) + { + typedef typename DerivedTaskType::functor_type functor_type ; + typedef typename DerivedTaskType::result_type result_type ; + + DerivedTaskType & self = * static_cast< DerivedTaskType * >(t); + + Kokkos::Impl::FunctorApply< functor_type , Tag , result_type & > + ::apply( (functor_type &) self , & self.m_result ); + } + + template< class DerivedTaskType , class Tag > + KOKKOS_FUNCTION static + void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< typename DerivedTaskType::result_type , void >::value + , TaskMember * >::type t ) + { + typedef typename DerivedTaskType::functor_type functor_type ; + + DerivedTaskType & self = * static_cast< DerivedTaskType * >(t); + + Kokkos::Impl::FunctorApply< functor_type , Tag , void >::apply( (functor_type &) self ); + } + + //---------------------------------------- + + template< class DerivedTaskType , class Tag > + KOKKOS_FUNCTION static + void apply_team( typename Kokkos::Impl::enable_if<( + Kokkos::Impl::is_same<Tag,void>::value + && + Kokkos::Impl::is_same<typename DerivedTaskType::result_type,void>::value + ), TaskMember * >::type t + , Kokkos::Impl::ThreadsExecTeamMember & member + ) + { + DerivedTaskType & self = * static_cast< DerivedTaskType * >(t); + + self.DerivedTaskType::functor_type::apply( member ); + } + + /** \brief Allocate and construct a task */ + template< class DerivedTaskType , class Tag > + KOKKOS_FUNCTION static + void apply_team( typename Kokkos::Impl::enable_if<( + Kokkos::Impl::is_same<Tag,void>::value + && + ! 
Kokkos::Impl::is_same<typename DerivedTaskType::result_type,void>::value + ), TaskMember * >::type t + , Kokkos::Impl::ThreadsExecTeamMember & member + ) + { + DerivedTaskType & self = * static_cast< DerivedTaskType * >(t); + + self.DerivedTaskType::functor_type::apply( member , self.m_result ); + } + + //---------------------------------------- + + /** \brief Allocate and construct a task */ + template< class DerivedTaskType , class Tag > + static + TaskMember * create( const typename DerivedTaskType::functor_type & arg_functor + , const function_team_type arg_apply_team + , const function_single_type arg_apply_single + , const unsigned arg_team_shmem + , const unsigned arg_dependence_capacity + ) + { + enum { padding_size = sizeof(DerivedTaskType) % sizeof(TaskMember*) + ? sizeof(TaskMember*) - sizeof(DerivedTaskType) % sizeof(TaskMember*) : 0 }; + enum { derived_size = sizeof(DerivedTaskType) + padding_size }; + + DerivedTaskType * const task = + new( allocate( derived_size + sizeof(TaskMember*) * arg_dependence_capacity ) ) + DerivedTaskType( arg_functor ); + + task->SelfType::m_dealloc = & TaskMember::template deallocate< DerivedTaskType > ; + task->SelfType::m_verify = & TaskMember::template verify_type< typename DerivedTaskType::value_type > ; + task->SelfType::m_team = arg_apply_team ; + task->SelfType::m_serial = arg_apply_single ; + task->SelfType::m_dep = (TaskMember**)( ((unsigned char *)task) + derived_size ); + task->SelfType::m_dep_capacity = arg_dependence_capacity ; + task->SelfType::m_shmem_size = arg_team_shmem ; + task->SelfType::m_state = TASK_STATE_CONSTRUCTING ; + + for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) task->SelfType::m_dep[i] = 0 ; + + return static_cast< TaskMember * >( task ); + } + + void reschedule(); + void schedule(); + + //---------------------------------------- + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + static + void assign( TaskMember ** const lhs , TaskMember * const rhs ); +#else + 
KOKKOS_INLINE_FUNCTION static + void assign( TaskMember ** const lhs , TaskMember * const rhs ) {} +#endif + + TaskMember * get_dependence( int i ) const ; + + KOKKOS_INLINE_FUNCTION + int get_dependence() const + { return m_dep_size ; } + + void clear_dependence(); + void add_dependence( TaskMember * before ); + + //---------------------------------------- + + typedef FutureValueTypeIsVoidError get_result_type ; + + KOKKOS_INLINE_FUNCTION + get_result_type get() const { return get_result_type() ; } + + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); } + +}; + +/** \brief A Future< Kokkos::Threads , ResultType > will cast + * from TaskMember< Kokkos::Threads , void , void > + * to TaskMember< Kokkos::Threads , ResultType , void > + * to query the result. + */ +template< class ResultType > +class TaskMember< Kokkos::Threads , ResultType , void > + : public TaskMember< Kokkos::Threads , void , void > +{ +public: + + typedef ResultType result_type ; + + result_type m_result ; + + typedef const result_type & get_result_type ; + + KOKKOS_INLINE_FUNCTION + get_result_type get() const { return m_result ; } + + inline + TaskMember() : TaskMember< Kokkos::Threads , void , void >(), m_result() {} + +#if defined( KOKKOS_HAVE_CXX11 ) + TaskMember( const TaskMember & ) = delete ; + TaskMember & operator = ( const TaskMember & ) = delete ; +#else +private: + TaskMember( const TaskMember & ); + TaskMember & operator = ( const TaskMember & ); +#endif +}; + +/** \brief Callback functions will cast + * from TaskMember< Kokkos::Threads , void , void > + * to TaskMember< Kokkos::Threads , ResultType , FunctorType > + * to execute work functions. 
+ */ +template< class ResultType , class FunctorType > +class TaskMember< Kokkos::Threads , ResultType , FunctorType > + : public TaskMember< Kokkos::Threads , ResultType , void > + , public FunctorType +{ +public: + typedef ResultType result_type ; + typedef FunctorType functor_type ; + + inline + TaskMember( const functor_type & arg_functor ) + : TaskMember< Kokkos::Threads , ResultType , void >() + , functor_type( arg_functor ) + {} +}; + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +void wait( TaskPolicy< Kokkos::Threads > & ); + +template<> +class TaskPolicy< Kokkos::Threads > +{ +public: + + typedef Kokkos::Threads execution_space ; + typedef TaskPolicy execution_policy ; + typedef Kokkos::Impl::ThreadsExecTeamMember member_type ; + +private: + + typedef Impl::TaskMember< Kokkos::Threads , void , void > task_root_type ; + + int m_default_dependence_capacity ; + int m_team_size ; ///< Fixed size of a task-team + + template< class FunctorType > + static inline + const task_root_type * get_task_root( const FunctorType * f ) + { + typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ; + return static_cast< const task_root_type * >( static_cast< const task_type * >(f) ); + } + + template< class FunctorType > + static inline + task_root_type * get_task_root( FunctorType * f ) + { + typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ; + return static_cast< task_root_type * >( static_cast< task_type * >(f) ); + } + +public: + + // Valid team sizes are 1, + // Threads::pool_size(1) == threads per numa, or + // Threads::pool_size(2) == threads per core + + TaskPolicy( const unsigned arg_default_dependence_capacity = 4 + , 
const unsigned arg_team_size = 0 /* default from thread pool topology */ + ); + + KOKKOS_INLINE_FUNCTION + TaskPolicy( const TaskPolicy & rhs ) + : m_default_dependence_capacity( rhs.m_default_dependence_capacity ) + , m_team_size( rhs.m_team_size ) + {} + + KOKKOS_INLINE_FUNCTION + TaskPolicy( const TaskPolicy & rhs + , const unsigned arg_default_dependence_capacity ) + : m_default_dependence_capacity( arg_default_dependence_capacity ) + , m_team_size( rhs.m_team_size ) + {} + + TaskPolicy & operator = ( const TaskPolicy &rhs ) { + m_default_dependence_capacity = rhs.m_default_dependence_capacity; + m_team_size = rhs.m_team_size; + return *this; + } + + // Create serial-thread task + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + Future< typename FunctorType::value_type , execution_space > + create( const FunctorType & functor + , const unsigned dependence_capacity = ~0u ) const + { + typedef typename FunctorType::value_type value_type ; + typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ; + + return Future< value_type , execution_space >( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + task_root_type::create< task_type , void > + ( functor + , task_root_type::function_team_type(0) + , & task_root_type::template apply_single< task_type , void > + , 0 + , ( ~0u == dependence_capacity ? 
m_default_dependence_capacity : dependence_capacity ) + ) +#endif + ); + } + + // Create thread-team task + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + Future< typename FunctorType::value_type , execution_space > + create_team( const FunctorType & functor + , const unsigned dependence_capacity = ~0u ) const + { + typedef typename FunctorType::value_type value_type ; + typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ; + + return Future< value_type , execution_space >( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + task_root_type::create< task_type , void > + ( functor + , & task_root_type::template apply_team< task_type , void > + , task_root_type::function_single_type(0) + , Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::value( functor , m_team_size ) + , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) + ) +#endif + ); + } + + template< class A1 , class A2 , class A3 , class A4 > + KOKKOS_INLINE_FUNCTION + void add_dependence( const Future<A1,A2> & after + , const Future<A3,A4> & before + , typename Kokkos::Impl::enable_if + < Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value + && + Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value + >::type * = 0 + ) const + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + after.m_task->add_dependence( before.m_task ); +#endif + } + + template< class FunctorType , class A3 , class A4 > + KOKKOS_INLINE_FUNCTION + void add_dependence( FunctorType * task_functor + , const Future<A3,A4> & before + , typename Kokkos::Impl::enable_if + < Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value + >::type * = 0 + ) const +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { get_task_root(task_functor)->add_dependence( before.m_task ); } +#else + {} +#endif + + + template< class ValueType > + const Future< 
ValueType , execution_space > & + spawn( const Future< ValueType , execution_space > & f ) const + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + f.m_task->schedule(); +#endif + return f ; + } + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + void respawn( FunctorType * task_functor ) const +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { get_task_root(task_functor)->reschedule(); } +#else + {} +#endif + + //---------------------------------------- + // Functions for an executing task functor to query dependences, + // set new dependences, and respawn itself. + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + Future< void , execution_space > + get_dependence( const FunctorType * task_functor , int i ) const + { + return Future<void,execution_space>( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + get_task_root(task_functor)->get_dependence(i) +#endif + ); + } + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + int get_dependence( const FunctorType * task_functor ) const +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return get_task_root(task_functor)->get_dependence(); } +#else + { return 0 ; } +#endif + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + void clear_dependence( FunctorType * task_functor ) const +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { get_task_root(task_functor)->clear_dependence(); } +#else + {} +#endif + + //---------------------------------------- + + static member_type & member_single(); + + friend void wait( TaskPolicy< Kokkos::Threads > & ); +}; + +} /* namespace Experimental */ +} /* namespace Kokkos */ + +#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */ + +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */ + + diff --git a/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.cpp b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.cpp new file mode 100755 
index 0000000000000000000000000000000000000000..50168fe3cc2db08069d94f75cb86bb1917f3eafe --- /dev/null +++ b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.cpp @@ -0,0 +1,275 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +bool +SharedAllocationRecord< void , void >:: +is_sane( SharedAllocationRecord< void , void > * arg_record ) +{ + constexpr static SharedAllocationRecord * zero = 0 ; + + SharedAllocationRecord * const root = arg_record ? arg_record->m_root : 0 ; + + bool ok = root != 0 && root->m_count == 0 ; + + if ( ok ) { + SharedAllocationRecord * root_next = 0 ; + + // Lock the list: + while ( ( root_next = Kokkos::atomic_exchange( & root->m_next , zero ) ) == 0 ); + + for ( SharedAllocationRecord * rec = root_next ; ok && rec != root ; rec = rec->m_next ) { + const bool ok_non_null = rec && rec->m_prev && ( rec == root || rec->m_next ); + const bool ok_root = ok_non_null && rec->m_root == root ; + const bool ok_prev_next = ok_non_null && ( rec->m_prev != root ? rec->m_prev->m_next == rec : root_next == rec ); + const bool ok_next_prev = ok_non_null && rec->m_next->m_prev == rec ; + const bool ok_count = ok_non_null && 0 <= rec->m_count ; + + ok = ok_root && ok_prev_next && ok_next_prev && ok_count ; + +if ( ! ok ) { + fprintf(stderr,"Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12lx){ m_count(%d) m_root(0x%.12lx) m_next(0x%.12lx) m_prev(0x%.12lx) m_next->m_prev(0x%.12lx) m_prev->m_next(0x%.12lx) }\n" + , reinterpret_cast< unsigned long >( rec ) + , rec->m_count + , reinterpret_cast< unsigned long >( rec->m_root ) + , reinterpret_cast< unsigned long >( rec->m_next ) + , reinterpret_cast< unsigned long >( rec->m_prev ) + , reinterpret_cast< unsigned long >( rec->m_next->m_prev ) + , reinterpret_cast< unsigned long >( rec->m_prev != rec->m_root ? 
rec->m_prev->m_next : root_next ) + ); +} + + } + + if ( zero != Kokkos::atomic_exchange( & root->m_next , root_next ) ) { + Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane unlocking"); + } + } + + return ok ; +} + +SharedAllocationRecord<void,void> * +SharedAllocationRecord<void,void>::find( SharedAllocationRecord<void,void> * const arg_root , void * const arg_data_ptr ) +{ + constexpr static SharedAllocationRecord * zero = 0 ; + + SharedAllocationRecord * root_next = 0 ; + + // Lock the list: + while ( ( root_next = Kokkos::atomic_exchange( & arg_root->m_next , 0 ) ) == 0 ); + + // Iterate searching for the record with this data pointer + + SharedAllocationRecord * r = root_next ; + + while ( ( r != arg_root ) && ( r->data() != arg_data_ptr ) ) { r = r->m_next ; } + + if ( r == arg_root ) { r = 0 ; } + + if ( zero != Kokkos::atomic_exchange( & arg_root->m_next , root_next ) ) { + Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed locking/unlocking"); + } + + return r ; +} + + +/**\brief Construct and insert into 'arg_root' tracking set. + * use_count is zero. 
+ */ +SharedAllocationRecord< void , void >:: +SharedAllocationRecord( SharedAllocationRecord<void,void> * arg_root + , SharedAllocationHeader * arg_alloc_ptr + , size_t arg_alloc_size + , SharedAllocationRecord< void , void >::function_type arg_dealloc + ) + : m_alloc_ptr( arg_alloc_ptr ) + , m_alloc_size( arg_alloc_size ) + , m_dealloc( arg_dealloc ) + , m_root( arg_root ) + , m_prev( 0 ) + , m_next( 0 ) + , m_count( 0 ) +{ + constexpr static SharedAllocationRecord * zero = 0 ; + + // Insert into the root double-linked list for tracking + // + // before: arg_root->m_next == next ; next->m_prev == arg_root + // after: arg_root->m_next == this ; this->m_prev == arg_root ; + // this->m_next == next ; next->m_prev == this + + m_prev = m_root ; + + // Read root->m_next and lock by setting to zero + while ( ( m_next = Kokkos::atomic_exchange( & m_root->m_next , zero ) ) == 0 ); + + m_next->m_prev = this ; + + if ( zero != Kokkos::atomic_exchange( & m_root->m_next , this ) ) { + Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed locking/unlocking"); + } +} + +void +SharedAllocationRecord< void , void >:: +increment( SharedAllocationRecord< void , void > * arg_record ) +{ + const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , 1 ); + + if ( old_count < 0 ) { // Error + Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed increment"); + } +} + +SharedAllocationRecord< void , void > * +SharedAllocationRecord< void , void >:: +decrement( SharedAllocationRecord< void , void > * arg_record ) +{ + constexpr static SharedAllocationRecord * zero = 0 ; + + const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , -1 ); + + if ( old_count == 1 ) { + + // before: arg_record->m_prev->m_next == arg_record && + // arg_record->m_next->m_prev == arg_record + // + // after: arg_record->m_prev->m_next == arg_record->m_next && + // arg_record->m_next->m_prev == 
arg_record->m_prev + + SharedAllocationRecord * root_next = 0 ; + + // Lock the list: + while ( ( root_next = Kokkos::atomic_exchange( & arg_record->m_root->m_next , 0 ) ) == 0 ); + + arg_record->m_next->m_prev = arg_record->m_prev ; + + if ( root_next != arg_record ) { + arg_record->m_prev->m_next = arg_record->m_next ; + } + else { + // before: arg_record->m_root == arg_record->m_prev + // after: arg_record->m_root == arg_record->m_next + root_next = arg_record->m_next ; + } + + // Unlock the list: + if ( zero != Kokkos::atomic_exchange( & arg_record->m_root->m_next , root_next ) ) { + Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed decrement unlocking"); + } + + arg_record->m_next = 0 ; + arg_record->m_prev = 0 ; + + function_type d = arg_record->m_dealloc ; + (*d)( arg_record ); + arg_record = 0 ; + } + else if ( old_count < 1 ) { // Error + Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed decrement count"); + } + + return arg_record ; +} + +void +SharedAllocationRecord< void , void >:: +print_host_accessible_records( std::ostream & s + , const char * const space_name + , const SharedAllocationRecord * const root + , const bool detail ) +{ + const SharedAllocationRecord< void , void > * r = root ; + + char buffer[256] ; + + if ( detail ) { + do { + + snprintf( buffer , 256 , "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n" + , space_name + , reinterpret_cast<unsigned long>( r ) + , reinterpret_cast<unsigned long>( r->m_prev ) + , reinterpret_cast<unsigned long>( r->m_next ) + , reinterpret_cast<unsigned long>( r->m_alloc_ptr ) + , r->m_alloc_size + , r->m_count + , reinterpret_cast<unsigned long>( r->m_dealloc ) + , r->m_alloc_ptr->m_label + ); + std::cout << buffer ; + r = r->m_next ; + } while ( r != root ); + } + else { + do { + if ( r->m_alloc_ptr ) { + + snprintf( buffer , 256 , "%s [ 0x%.12lx + %ld ] 
%s\n" + , space_name + , reinterpret_cast< unsigned long >( r->data() ) + , r->size() + , r->m_alloc_ptr->m_label + ); + } + else { + snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name ); + } + std::cout << buffer ; + r = r->m_next ; + } while ( r != root ); + } +} + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + + diff --git a/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp new file mode 100755 index 0000000000000000000000000000000000000000..d9491b55329ca561af8df7e540848b158e3da4fe --- /dev/null +++ b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp @@ -0,0 +1,287 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template< class MemorySpace = void , class DestroyFunctor = void > +class SharedAllocationRecord ; + +class SharedAllocationHeader { +private: + + typedef SharedAllocationRecord<void,void> Record ; + + static constexpr unsigned maximum_label_length = ( 1u << 7 /* 128 */ ) - sizeof(Record*); + + template< class , class > friend class SharedAllocationRecord ; + + Record * m_record ; + char m_label[ maximum_label_length ]; + +public: + + /* Given user memory get pointer to the header */ + KOKKOS_INLINE_FUNCTION static + const SharedAllocationHeader * get_header( void * alloc_ptr ) + { return reinterpret_cast<SharedAllocationHeader*>( reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader) ); } +}; + +template<> +class SharedAllocationRecord< void , void > { +protected: + + static_assert( sizeof(SharedAllocationHeader) == ( 1u << 7 /* 128 */ ) , "sizeof(SharedAllocationHeader) != 128" ); + + template< class , class > friend class SharedAllocationRecord ; + + typedef void (* function_type )( SharedAllocationRecord<void,void> * ); + + SharedAllocationHeader * const m_alloc_ptr ; + size_t const m_alloc_size ; + function_type const m_dealloc ; + SharedAllocationRecord * const m_root ; + 
SharedAllocationRecord * m_prev ; + SharedAllocationRecord * m_next ; + int m_count ; + + SharedAllocationRecord( const SharedAllocationRecord & ) = delete ; + SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ; + + /**\brief Construct and insert into 'arg_root' tracking set. + * use_count is zero. + */ + SharedAllocationRecord( SharedAllocationRecord * arg_root + , SharedAllocationHeader * arg_alloc_ptr + , size_t arg_alloc_size + , function_type arg_dealloc + ); + +public: + + ~SharedAllocationRecord() = default ; + + constexpr SharedAllocationRecord() + : m_alloc_ptr( 0 ) + , m_alloc_size( 0 ) + , m_dealloc( 0 ) + , m_root( this ) + , m_prev( this ) + , m_next( this ) + , m_count( 0 ) + {} + + static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length ; + + KOKKOS_INLINE_FUNCTION + const SharedAllocationHeader * head() const { return m_alloc_ptr ; } + + /* User's memory begins at the end of the header */ + KOKKOS_INLINE_FUNCTION + void * data() const { return reinterpret_cast<void*>( m_alloc_ptr + 1 ); } + + /* User's memory begins at the end of the header */ + constexpr size_t size() const { return m_alloc_size - sizeof(SharedAllocationHeader) ; } + + /* Cannot be 'constexpr' because 'm_count' is volatile */ + int use_count() const { return m_count ; } + + /* Increment use count */ + static void increment( SharedAllocationRecord * ); + + /* Decrement use count. If 1->0 then remove from the tracking list and invoke m_dealloc */ + static SharedAllocationRecord * decrement( SharedAllocationRecord * ); + + /* Given a root record and data pointer find the record */ + static SharedAllocationRecord * find( SharedAllocationRecord * const , void * const ); + + /* Sanity check for the whole set of records to which the input record belongs. + * Locks the set's insert/erase operations until the sanity check is complete. 
+ */ + static bool is_sane( SharedAllocationRecord * ); + + /* Print host-accessible records */ + static void print_host_accessible_records( std::ostream & + , const char * const space_name + , const SharedAllocationRecord * const root + , const bool detail ); +}; + +/* + * Memory space specialization of SharedAllocationRecord< Space , void > requires : + * + * SharedAllocationRecord< Space , void > : public SharedAllocationRecord< void , void > + * { + * // delete allocated user memory via static_cast to this type. + * static void deallocate( const SharedAllocationRecord<void,void> * ); + * Space m_space ; + * } + */ + +template< class MemorySpace , class DestroyFunctor > +class SharedAllocationRecord : public SharedAllocationRecord< MemorySpace , void > +{ +private: + + static void deallocate( SharedAllocationRecord<void,void> * record_ptr ) + { delete static_cast<SharedAllocationRecord<MemorySpace,DestroyFunctor>*>(record_ptr); } + + SharedAllocationRecord( const MemorySpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc + ) + /* Allocate user memory as [ SharedAllocationHeader , user_memory ] */ + : SharedAllocationRecord< MemorySpace , void >( arg_space , arg_label , arg_alloc , & deallocate ) + , m_destroy() + {} + + ~SharedAllocationRecord() { m_destroy.destroy_shared_allocation(); } + +public: + + DestroyFunctor m_destroy ; + + // Allocate with a zero use count. Incrementing the use count from zero to one + // inserts the record into the tracking list. Decrementing the count from one to zero + // removes from the trakcing list and deallocates. 
+ KOKKOS_INLINE_FUNCTION static + SharedAllocationRecord * allocate( const MemorySpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc + ) + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return new SharedAllocationRecord( arg_space , arg_label , arg_alloc ); +#else + return (SharedAllocationRecord *) 0 ; +#endif + } +}; + +union SharedAllocationTracker { +private: + + typedef SharedAllocationRecord<void,void> Record ; + + enum : unsigned long { + DO_NOT_DEREF_FLAG = 0x01ul + }; + + // The allocation record resides in Host memory space + Record * m_record ; + unsigned long m_record_bits; + + KOKKOS_INLINE_FUNCTION + static Record * disable( Record * rec ) + { return reinterpret_cast<Record*>( reinterpret_cast<unsigned long>( rec ) & DO_NOT_DEREF_FLAG ); } + + KOKKOS_INLINE_FUNCTION + void increment() const + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::increment( m_record ); +#endif + } + + KOKKOS_INLINE_FUNCTION + void decrement() const + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + if ( ! 
( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::decrement( m_record ); +#endif + } + +public: + + KOKKOS_INLINE_FUNCTION + constexpr SharedAllocationTracker() : m_record_bits( DO_NOT_DEREF_FLAG ) {} + + template< class MemorySpace > + constexpr + SharedAllocationRecord< MemorySpace , void > & get_record() const + { return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); } + + template< class MemorySpace > + std::string get_label() const + { return static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record )->get_label(); } + + KOKKOS_INLINE_FUNCTION + SharedAllocationTracker( Record * arg_record ) + : m_record( arg_record ) { increment(); } + + KOKKOS_INLINE_FUNCTION + ~SharedAllocationTracker() { decrement(); } + + KOKKOS_INLINE_FUNCTION + SharedAllocationTracker( const SharedAllocationTracker & rhs ) + : m_record( rhs.m_record ) { increment(); } + + KOKKOS_INLINE_FUNCTION + SharedAllocationTracker( SharedAllocationTracker && rhs ) + : m_record( rhs.m_record ) { rhs.m_record_bits = DO_NOT_DEREF_FLAG ; } + + KOKKOS_INLINE_FUNCTION + SharedAllocationTracker & operator = ( const SharedAllocationTracker & rhs ) + { + decrement(); + m_record = rhs.m_record ; + increment(); + return *this ; + } + + KOKKOS_INLINE_FUNCTION + SharedAllocationTracker & operator = ( SharedAllocationTracker && rhs ) + { + m_record = rhs.m_record ; + rhs.m_record_bits = DO_NOT_DEREF_FLAG ; + return *this ; + } +}; + + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + + diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewAllocProp.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewAllocProp.hpp new file mode 100755 index 0000000000000000000000000000000000000000..348ccaf5ed3bcd8345e05d3880e6cc34badf017b --- /dev/null +++ b/lib/kokkos/core/src/impl/KokkosExp_ViewAllocProp.hpp @@ -0,0 +1,416 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXPERIMENTAL_IMPL_VIEW_ALLOC_PROP_HPP +#define KOKKOS_EXPERIMENTAL_IMPL_VIEW_ALLOC_PROP_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +struct WithoutInitializing_t {}; +struct AllowPadding_t {}; + +template< class ... Parameters > +struct ViewAllocProp ; + +template<> +struct ViewAllocProp<> { + + struct NullSpace {}; + + typedef std::false_type allow_padding_t ; + typedef std::true_type initialize_t ; + typedef NullSpace memory_space ; + typedef NullSpace execution_space ; + + const std::string label ; + const memory_space memory ; + const execution_space execution ; + const allow_padding_t allow_padding ; + const initialize_t initialize ; + + ViewAllocProp() + : label() + , memory() + , execution() + , allow_padding() + , initialize() + {} + + ViewAllocProp( const std::string & arg_label ) + : label( arg_label ) + , memory() + , execution() + , allow_padding() + , initialize() + {} +}; + +template< class ... Parameters > +struct ViewAllocProp< const char * , Parameters ... > +{ + typedef ViewAllocProp< Parameters ... > base_prop_type ; + + typedef typename base_prop_type::allow_padding_t allow_padding_t ; + typedef typename base_prop_type::initialize_t initialize_t ; + typedef typename base_prop_type::memory_space memory_space ; + typedef typename base_prop_type::execution_space execution_space ; + + const std::string label ; + const memory_space memory ; + const execution_space execution ; + const allow_padding_t allow_padding ; + const initialize_t initialize ; + + ViewAllocProp( const char * const arg_label , Parameters ... arg_param ) + : label( arg_label ) + , memory( base_prop_type( arg_param ... 
).memory ) + , execution( base_prop_type( arg_param ... ).execution ) + , allow_padding() + , initialize() + {} +}; + +template< class ... Parameters > +struct ViewAllocProp< std::string , Parameters ... > +{ + typedef ViewAllocProp< Parameters ... > base_prop_type ; + + typedef typename base_prop_type::allow_padding_t allow_padding_t ; + typedef typename base_prop_type::initialize_t initialize_t ; + typedef typename base_prop_type::memory_space memory_space ; + typedef typename base_prop_type::execution_space execution_space ; + + const std::string label ; + const memory_space memory ; + const execution_space execution ; + const allow_padding_t allow_padding ; + const initialize_t initialize ; + + ViewAllocProp( const std::string & arg_label , Parameters ... arg_param ) + : label( arg_label ) + , memory( base_prop_type( arg_param ... ).memory ) + , execution( base_prop_type( arg_param ... ).execution ) + , allow_padding() + , initialize() + {} +}; + +template< class ... Parameters > +struct ViewAllocProp< WithoutInitializing_t , Parameters ... > +{ + typedef ViewAllocProp< Parameters ... > base_prop_type ; + + typedef typename base_prop_type::allow_padding_t allow_padding_t ; + typedef std::false_type initialize_t ; + typedef typename base_prop_type::memory_space memory_space ; + typedef typename base_prop_type::execution_space execution_space ; + + const std::string label ; + const memory_space memory ; + const execution_space execution ; + const allow_padding_t allow_padding ; + const initialize_t initialize ; + + ViewAllocProp( const WithoutInitializing_t & , Parameters ... arg_param ) + : label( base_prop_type( arg_param ... ).label ) + , memory( base_prop_type( arg_param ... ).memory ) + , execution( base_prop_type( arg_param ... ).execution ) + , allow_padding() + , initialize() + {} +}; + +template< class ... Parameters > +struct ViewAllocProp< AllowPadding_t , Parameters ... > +{ + typedef ViewAllocProp< Parameters ... 
> base_prop_type ; + + typedef std::true_type allow_padding_t ; + typedef typename base_prop_type::initialize_t initialize_t ; + typedef typename base_prop_type::memory_space memory_space ; + typedef typename base_prop_type::execution_space execution_space ; + + const std::string label ; + const memory_space memory ; + const execution_space execution ; + const allow_padding_t allow_padding ; + const initialize_t initialize ; + + ViewAllocProp( const AllowPadding_t & , Parameters ... arg_param ) + : label( base_prop_type( arg_param ... ).label ) + , memory( base_prop_type( arg_param ... ).memory ) + , execution( base_prop_type( arg_param ... ).execution ) + , allow_padding() + , initialize() + {} +}; + +template< class Space , class ... Parameters > +struct ViewAllocProp< Space , Parameters ... > +{ + enum { is_exec = Kokkos::Impl::is_execution_space< Space >::value }; + enum { is_mem = Kokkos::Impl::is_memory_space< Space >::value }; + + static_assert( is_exec || is_mem , "View allocation given unknown parameter" ); + + typedef ViewAllocProp< Parameters ... > base_prop_type ; + + typedef typename base_prop_type::allow_padding_t allow_padding_t ; + typedef typename base_prop_type::initialize_t initialize_t ; + typedef typename std::conditional< is_mem , Space , typename base_prop_type::memory_space >::type memory_space ; + typedef typename std::conditional< is_exec , Space , typename base_prop_type::execution_space >::type execution_space ; + + const std::string label ; + const memory_space memory ; + const execution_space execution ; + const allow_padding_t allow_padding ; + const initialize_t initialize ; + + // Templated so that 'base_prop_type( args ... ).execution' + // is not used unless arg_space == memory_space. + template< class ... Args > + ViewAllocProp( const memory_space & arg_space , Args ... args ) + : label( base_prop_type( args ... ).label ) + , memory( arg_space ) + , execution( base_prop_type( args ... 
).execution ) + , allow_padding() + , initialize() + {} + + // Templated so that 'base_prop_type( args ... ).memory' + // is not used unless arg_space == execution_space. + template< class ... Args > + ViewAllocProp( const execution_space & arg_space , Args ... args ) + : label( base_prop_type( args ... ).label ) + , memory( base_prop_type( args ... ).memory ) + , execution( arg_space ) + , allow_padding() + , initialize() + {} +}; + +template< class ExecSpace , class MemSpace > +struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace > , std::string > +{ + typedef ViewAllocProp<> base_prop_type ; + + typedef typename base_prop_type::allow_padding_t allow_padding_t ; + typedef typename base_prop_type::initialize_t initialize_t ; + typedef MemSpace memory_space ; + typedef ExecSpace execution_space ; + + const std::string label ; + const memory_space memory ; + const execution_space execution ; + const allow_padding_t allow_padding ; + const initialize_t initialize ; + + ViewAllocProp( const std::string & arg_label ) + : label( arg_label ) + , memory() + , execution() + , allow_padding() + , initialize() + {} +}; + +template< class ExecSpace , class MemSpace , unsigned N > +struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace > , char[N] > +{ + typedef ViewAllocProp<> base_prop_type ; + + typedef typename base_prop_type::allow_padding_t allow_padding_t ; + typedef typename base_prop_type::initialize_t initialize_t ; + typedef MemSpace memory_space ; + typedef ExecSpace execution_space ; + + const std::string label ; + const memory_space memory ; + const execution_space execution ; + const allow_padding_t allow_padding ; + const initialize_t initialize ; + + ViewAllocProp( const char * const arg_label ) + : label( arg_label ) + , memory() + , execution() + , allow_padding() + , initialize() + {} +}; + + +// Deprecate in favor of view_alloc( Kokkos::WithoutInitializing ) +template< class ExecSpace , class MemSpace > +struct ViewAllocProp< Kokkos::Device< 
ExecSpace , MemSpace > + , Kokkos::ViewAllocateWithoutInitializing + > +{ + typedef ViewAllocProp<> base_prop_type ; + + typedef typename base_prop_type::allow_padding_t allow_padding_t ; + typedef std::false_type initialize_t ; + typedef MemSpace memory_space ; + typedef ExecSpace execution_space ; + + const std::string label ; + const memory_space memory ; + const execution_space execution ; + const allow_padding_t allow_padding ; + const initialize_t initialize ; + + ViewAllocProp( const Kokkos::ViewAllocateWithoutInitializing & arg ) + : label( arg.label ) + , memory() + , execution() + , allow_padding() + , initialize() + {} +}; + +template< class ExecSpace , class MemSpace , class ... Parameters > +struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace > + , ViewAllocProp< Parameters ... > + > +{ + typedef ViewAllocProp< Parameters ... > base_prop_type ; + + typedef typename base_prop_type::allow_padding_t allow_padding_t ; + typedef typename base_prop_type::initialize_t initialize_t ; + typedef MemSpace memory_space ; + + typedef + typename std::conditional + < Kokkos::Impl::is_execution_space< typename base_prop_type::execution_space >::value + , typename base_prop_type::execution_space + , ExecSpace + >::type execution_space ; + + static_assert( std::is_same< typename base_prop_type::memory_space , ViewAllocProp<>::NullSpace >::value || + std::is_same< typename base_prop_type::memory_space , memory_space >::value + , "View allocation given incompatible memory space" ); + + static_assert( Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename execution_space::memory_space + , memory_space >::value + , "View allocation given incompatible execution space" ); + + const std::string label ; + const memory_space memory ; + const execution_space execution ; + const allow_padding_t allow_padding ; + const initialize_t initialize ; + + // If the input properties have a memory or execution space then copy construct those spaces + // otherwise default 
construct those spaces. + + template< class P > + ViewAllocProp( const P & arg_prop + , typename std::enable_if + < std::is_same< P , base_prop_type >::value && + Kokkos::Impl::is_memory_space< typename P::memory_space >::value && + Kokkos::Impl::is_execution_space< typename P::memory_space >::value + >::type * = 0 ) + : label( arg_prop.label ) + , memory( arg_prop.memory ) + , execution( arg_prop.execution ) + , allow_padding() + , initialize() + {} + + template< class P > + ViewAllocProp( const P & arg_prop + , typename std::enable_if + < std::is_same< P , base_prop_type >::value && + Kokkos::Impl::is_memory_space< typename P::memory_space >::value && + ! Kokkos::Impl::is_execution_space< typename P::execution_space >::value + >::type * = 0 ) + : label( arg_prop.label ) + , memory( arg_prop.memory ) + , execution() + , allow_padding() + , initialize() + {} + + template< class P > + ViewAllocProp( const P & arg_prop + , typename std::enable_if + < std::is_same< P , base_prop_type >::value && + ! Kokkos::Impl::is_memory_space< typename P::memory_space >::value && + Kokkos::Impl::is_execution_space< typename P::execution_space >::value + >::type * = 0 ) + : label( arg_prop.label ) + , memory() + , execution( arg_prop.execution ) + , allow_padding() + , initialize() + {} + + template< class P > + ViewAllocProp( const P & arg_prop + , typename std::enable_if + < std::is_same< P , base_prop_type >::value && + ! Kokkos::Impl::is_memory_space< typename P::memory_space >::value && + ! 
Kokkos::Impl::is_execution_space< typename P::execution_space >::value + >::type * = 0 ) + : label( arg_prop.label ) + , memory() + , execution() + , allow_padding() + , initialize() + {} +}; + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif + diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp new file mode 100755 index 0000000000000000000000000000000000000000..bd2b4c675bd332f8eeea85bf52582c2e90fd02a8 --- /dev/null +++ b/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp @@ -0,0 +1,2683 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP +#define KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP + +#include <type_traits> +#include <initializer_list> + +#include <Kokkos_Pair.hpp> +#include <Kokkos_Layout.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Atomic_View.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ExecPolicy > class ParallelFor ; + +}} /* namespace Kokkos::Impl */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template< long sN0 = -1 + , long sN1 = -1 + , long sN2 = -1 + , long sN3 = -1 + , long sN4 = -1 + , long sN5 = -1 + , long sN6 = -1 + , long sN7 = -1 + > +struct ViewDimension { + + enum { rank = ( sN0 < 0 ? 0 : + ( sN1 < 0 ? 
1 : + ( sN2 < 0 ? 2 : + ( sN3 < 0 ? 3 : + ( sN4 < 0 ? 4 : + ( sN5 < 0 ? 5 : + ( sN6 < 0 ? 6 : + ( sN7 < 0 ? 7 : 8 )))))))) }; + enum { rank_dynamic = 0 }; + + enum { N0 = 0 < sN0 ? sN0 : 1 }; + enum { N1 = 0 < sN1 ? sN1 : 1 }; + enum { N2 = 0 < sN2 ? sN2 : 1 }; + enum { N3 = 0 < sN3 ? sN3 : 1 }; + enum { N4 = 0 < sN4 ? sN4 : 1 }; + enum { N5 = 0 < sN5 ? sN5 : 1 }; + enum { N6 = 0 < sN6 ? sN6 : 1 }; + enum { N7 = 0 < sN7 ? sN7 : 1 }; + + ViewDimension() = default ; + ViewDimension( const ViewDimension & ) = default ; + ViewDimension & operator = ( const ViewDimension & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension( size_t , unsigned , unsigned , unsigned + , unsigned , unsigned , unsigned , unsigned ) {} +}; + +template< long sN1 + , long sN2 + , long sN3 + , long sN4 + , long sN5 + , long sN6 + , long sN7 + > +struct ViewDimension< 0, sN1, sN2, sN3, sN4, sN5, sN6, sN7 > { + + enum { rank = ( sN1 < 0 ? 1 : + ( sN2 < 0 ? 2 : + ( sN3 < 0 ? 3 : + ( sN4 < 0 ? 4 : + ( sN5 < 0 ? 5 : + ( sN6 < 0 ? 6 : + ( sN7 < 0 ? 7 : 8 ))))))) }; + enum { rank_dynamic = 1 }; + + size_t N0 ; /* When 1 == rank_dynamic allow N0 >= 2^32 */ + enum { N1 = 0 < sN1 ? sN1 : 1 }; + enum { N2 = 0 < sN2 ? sN2 : 1 }; + enum { N3 = 0 < sN3 ? sN3 : 1 }; + enum { N4 = 0 < sN4 ? sN4 : 1 }; + enum { N5 = 0 < sN5 ? sN5 : 1 }; + enum { N6 = 0 < sN6 ? sN6 : 1 }; + enum { N7 = 0 < sN7 ? sN7 : 1 }; + + ViewDimension() = default ; + ViewDimension( const ViewDimension & ) = default ; + ViewDimension & operator = ( const ViewDimension & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension( size_t aN0 , unsigned , unsigned , unsigned + , unsigned , unsigned , unsigned , unsigned ) + : N0( aN0 ) {} +}; + +template< long sN2 + , long sN3 + , long sN4 + , long sN5 + , long sN6 + , long sN7 + > +struct ViewDimension< 0, 0, sN2, sN3, sN4, sN5, sN6, sN7 > { + + enum { rank = ( sN2 < 0 ? 2 : + ( sN3 < 0 ? 3 : + ( sN4 < 0 ? 4 : + ( sN5 < 0 ? 5 : + ( sN6 < 0 ? 6 : + ( sN7 < 0 ? 
7 : 8 )))))) }; + enum { rank_dynamic = 2 }; + + size_t N0 ; /* When 2 == rank_dynamic allow N0 >= 2^32 */ + size_t N1 ; /* When 2 == rank_dynamic allow N1 >= 2^32 */ + enum { N2 = 0 < sN2 ? sN2 : 1 }; + enum { N3 = 0 < sN3 ? sN3 : 1 }; + enum { N4 = 0 < sN4 ? sN4 : 1 }; + enum { N5 = 0 < sN5 ? sN5 : 1 }; + enum { N6 = 0 < sN6 ? sN6 : 1 }; + enum { N7 = 0 < sN7 ? sN7 : 1 }; + + ViewDimension() = default ; + ViewDimension( const ViewDimension & ) = default ; + ViewDimension & operator = ( const ViewDimension & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension( size_t aN0 , unsigned aN1 , unsigned , unsigned + , unsigned , unsigned , unsigned , unsigned ) + : N0( aN0 ) , N1( aN1 ) {} +}; + +template< long sN3 + , long sN4 + , long sN5 + , long sN6 + , long sN7 + > +struct ViewDimension< 0, 0, 0, sN3, sN4, sN5, sN6, sN7 > { + + enum { rank = ( sN3 < 0 ? 3 : + ( sN4 < 0 ? 4 : + ( sN5 < 0 ? 5 : + ( sN6 < 0 ? 6 : + ( sN7 < 0 ? 7 : 8 ))))) }; + enum { rank_dynamic = 3 }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + enum { N3 = 0 < sN3 ? sN3 : 1 }; + enum { N4 = 0 < sN4 ? sN4 : 1 }; + enum { N5 = 0 < sN5 ? sN5 : 1 }; + enum { N6 = 0 < sN6 ? sN6 : 1 }; + enum { N7 = 0 < sN7 ? sN7 : 1 }; + + ViewDimension() = default ; + ViewDimension( const ViewDimension & ) = default ; + ViewDimension & operator = ( const ViewDimension & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension( size_t aN0 , unsigned aN1 , unsigned aN2 , unsigned + , unsigned , unsigned , unsigned , unsigned ) + : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) {} +}; + +template< long sN4 + , long sN5 + , long sN6 + , long sN7 + > +struct ViewDimension< 0, 0, 0, 0, sN4, sN5, sN6, sN7 > { + + enum { rank = ( sN4 < 0 ? 4 : + ( sN5 < 0 ? 5 : + ( sN6 < 0 ? 6 : + ( sN7 < 0 ? 7 : 8 )))) }; + enum { rank_dynamic = 4 }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + enum { N4 = 0 < sN4 ? sN4 : 1 }; + enum { N5 = 0 < sN5 ? sN5 : 1 }; + enum { N6 = 0 < sN6 ? 
sN6 : 1 }; + enum { N7 = 0 < sN7 ? sN7 : 1 }; + + ViewDimension() = default ; + ViewDimension( const ViewDimension & ) = default ; + ViewDimension & operator = ( const ViewDimension & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension( size_t aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3 + , unsigned , unsigned , unsigned , unsigned ) + : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) , N3( aN3 ) {} +}; + +template< long sN5 + , long sN6 + , long sN7 + > +struct ViewDimension< 0, 0, 0, 0, 0, sN5, sN6, sN7 > { + + enum { rank = ( sN5 < 0 ? 5 : + ( sN6 < 0 ? 6 : + ( sN7 < 0 ? 7 : 8 ))) }; + enum { rank_dynamic = 5 }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + enum { N5 = 0 < sN5 ? sN5 : 1 }; + enum { N6 = 0 < sN6 ? sN6 : 1 }; + enum { N7 = 0 < sN7 ? sN7 : 1 }; + + ViewDimension() = default ; + ViewDimension( const ViewDimension & ) = default ; + ViewDimension & operator = ( const ViewDimension & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension( size_t aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3 + , unsigned aN4 , unsigned , unsigned , unsigned ) + : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) , N3( aN3 ) , N4( aN4 ) {} +}; + +template< long sN6 + , long sN7 + > +struct ViewDimension< 0, 0, 0, 0, 0, 0, sN6, sN7 > { + + enum { rank = ( sN6 < 0 ? 6 : + ( sN7 < 0 ? 7 : 8 )) }; + enum { rank_dynamic = 6 }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + unsigned N5 ; + enum { N6 = 0 < sN6 ? sN6 : 1 }; + enum { N7 = 0 < sN7 ? 
sN7 : 1 }; + + ViewDimension() = default ; + ViewDimension( const ViewDimension & ) = default ; + ViewDimension & operator = ( const ViewDimension & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension( size_t aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3 + , unsigned aN4 , unsigned aN5 , unsigned , unsigned ) + : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) , N3( aN3 ) , N4( aN4 ) , N5( aN5 ) {} +}; + +template< long sN7 > +struct ViewDimension< 0, 0, 0, 0, 0, 0, 0, sN7 > { + + enum { rank = ( sN7 < 0 ? 7 : 8 ) }; + enum { rank_dynamic = 7 }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + unsigned N5 ; + unsigned N6 ; + enum { N7 = 0 < sN7 ? sN7 : 1 }; + + ViewDimension() = default ; + ViewDimension( const ViewDimension & ) = default ; + ViewDimension & operator = ( const ViewDimension & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension( size_t aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3 + , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned ) + : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) , N3( aN3 ) , N4( aN4 ) , N5( aN5 ) , N6( aN6 ) {} +}; + +template<> +struct ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > { + + enum { rank = 8 }; + enum { rank_dynamic = 8 }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + unsigned N5 ; + unsigned N6 ; + unsigned N7 ; + + ViewDimension() = default ; + ViewDimension( const ViewDimension & ) = default ; + ViewDimension & operator = ( const ViewDimension & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension( size_t aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3 + , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned aN7 ) + : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) , N3( aN3 ) , N4( aN4 ) , N5( aN5 ) , N6( aN6 ) , N7( aN7 ) {} +}; + +//---------------------------------------------------------------------------- + +template< class DstDim , class SrcDim > +struct ViewDimensionAssignable ; + +template< long dN0 , long dN1 , long dN2 
, long dN3 , long dN4 , long dN5 , long dN6 , long dN7 + , long sN0 , long sN1 , long sN2 , long sN3 , long sN4 , long sN5 , long sN6 , long sN7 > +struct ViewDimensionAssignable< ViewDimension<dN0,dN1,dN2,dN3,dN4,dN5,dN6,dN7> + , ViewDimension<sN0,sN1,sN2,sN3,sN4,sN5,sN6,sN7> > +{ + typedef ViewDimension<dN0,dN1,dN2,dN3,dN4,dN5,dN6,dN7> dst ; + typedef ViewDimension<sN0,sN1,sN2,sN3,sN4,sN5,sN6,sN7> src ; + + enum { value = dst::rank == src::rank && + dst::rank_dynamic >= src::rank_dynamic && + ( 0 < dst::rank_dynamic || dN0 == sN0 ) && + ( 1 < dst::rank_dynamic || dN1 == sN1 ) && + ( 2 < dst::rank_dynamic || dN2 == sN2 ) && + ( 3 < dst::rank_dynamic || dN3 == sN3 ) && + ( 4 < dst::rank_dynamic || dN4 == sN4 ) && + ( 5 < dst::rank_dynamic || dN5 == sN5 ) && + ( 6 < dst::rank_dynamic || dN6 == sN6 ) && + ( 7 < dst::rank_dynamic || dN7 == sN7 ) }; +}; + +//---------------------------------------------------------------------------- + +template< class Dim , unsigned N , unsigned R = Dim::rank_dynamic > +struct ViewDimensionInsert ; + +template< class Dim , unsigned N > +struct ViewDimensionInsert< Dim , N , 0 > +{ + typedef ViewDimension< N + , 0 < Dim::rank ? Dim::N0 : -1 + , 1 < Dim::rank ? Dim::N1 : -1 + , 2 < Dim::rank ? Dim::N2 : -1 + , 3 < Dim::rank ? Dim::N3 : -1 + , 4 < Dim::rank ? Dim::N4 : -1 + , 5 < Dim::rank ? Dim::N5 : -1 + , 6 < Dim::rank ? Dim::N6 : -1 + > type ; +}; + +template< class Dim , unsigned N > +struct ViewDimensionInsert< Dim , N , 1 > +{ + typedef ViewDimension< 0 , N + , 1 < Dim::rank ? Dim::N1 : -1 + , 2 < Dim::rank ? Dim::N2 : -1 + , 3 < Dim::rank ? Dim::N3 : -1 + , 4 < Dim::rank ? Dim::N4 : -1 + , 5 < Dim::rank ? Dim::N5 : -1 + , 6 < Dim::rank ? Dim::N6 : -1 + > type ; +}; + +template< class Dim , unsigned N > +struct ViewDimensionInsert< Dim , N , 2 > +{ + typedef ViewDimension< 0 , 0 , N + , 2 < Dim::rank ? Dim::N2 : -1 + , 3 < Dim::rank ? Dim::N3 : -1 + , 4 < Dim::rank ? Dim::N4 : -1 + , 5 < Dim::rank ? 
Dim::N5 : -1 + , 6 < Dim::rank ? Dim::N6 : -1 + > type ; +}; + +template< class Dim , unsigned N > +struct ViewDimensionInsert< Dim , N , 3 > +{ + typedef ViewDimension< 0 , 0 , 0 , N + , 3 < Dim::rank ? Dim::N3 : -1 + , 4 < Dim::rank ? Dim::N4 : -1 + , 5 < Dim::rank ? Dim::N5 : -1 + , 6 < Dim::rank ? Dim::N6 : -1 + > type ; +}; + +template< class Dim , unsigned N > +struct ViewDimensionInsert< Dim , N , 4 > +{ + typedef ViewDimension< 0 , 0 , 0 , 0 , N + , 4 < Dim::rank ? Dim::N4 : -1 + , 5 < Dim::rank ? Dim::N5 : -1 + , 6 < Dim::rank ? Dim::N6 : -1 + > type ; +}; + +template< class Dim , unsigned N > +struct ViewDimensionInsert< Dim , N , 5 > +{ + typedef ViewDimension< 0 , 0 , 0 , 0 , 0 , N + , 5 < Dim::rank ? Dim::N5 : -1 + , 6 < Dim::rank ? Dim::N6 : -1 + > type ; +}; + +template< class Dim , unsigned N > +struct ViewDimensionInsert< Dim , N , 6 > +{ + typedef ViewDimension< 0 , 0 , 0 , 0 , 0 , 0 , N + , 6 < Dim::rank ? Dim::N6 : -1 + > type ; +}; + +template< class Dim , unsigned N > +struct ViewDimensionInsert< Dim , N , 7 > +{ + typedef ViewDimension< 0 , 0 , 0 , 0 , 0 , 0 , 0 , N > type ; +}; + +}}} // namespace Kokkos::Experimental::Impl + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +/** \brief Analyze the array dimensions defined by a Kokkos::View data type. + * + * It is presumed that the data type can be mapped down to a multidimensional + * array of an intrinsic scalar numerical type (double, float, int, ... ). + * The 'value_type' of an array may be an embedded aggregate type such + * as a fixed length array 'Array<T,N>'. + * In this case the 'array_intrinsic_type' represents the + * underlying array of intrinsic scalar numerical type. 
+ * + * The embedded aggregate type must have an AnalyzeShape specialization + * to map it down to a shape and intrinsic scalar numerical type. + */ +template< class T > +struct ViewDataAnalysis +{ + typedef void specialize ; // No specialization + + typedef ViewDimension<> dimension ; + + typedef T type ; + typedef T value_type ; + typedef T array_scalar_type ; + + typedef typename std::add_const< T >::type const_type ; + typedef typename std::add_const< T >::type const_value_type ; + typedef typename std::add_const< T >::type const_array_scalar_type ; + + typedef typename std::remove_const< T >::type non_const_type ; + typedef typename std::remove_const< T >::type non_const_value_type ; + typedef typename std::remove_const< T >::type non_const_array_scalar_type ; +}; + +template< class T > +struct ViewDataAnalysis< T * > +{ +private: + + typedef ViewDataAnalysis< T > nested ; + +public: + + typedef typename nested::specialize specialize ; + + typedef typename ViewDimensionInsert< typename nested::dimension , 0 >::type dimension ; + + typedef typename nested::type * type ; + typedef typename nested::value_type value_type ; + typedef typename nested::array_scalar_type * array_scalar_type ; + + typedef typename nested::const_type * const_type ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_array_scalar_type * const_array_scalar_type ; + + typedef typename nested::non_const_type * non_const_type ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_array_scalar_type * non_const_array_scalar_type ; +}; + +template< class T > +struct ViewDataAnalysis< T [] > +{ +private: + + typedef ViewDataAnalysis< T > nested ; + +public: + + typedef typename nested::specialize specialize ; + + typedef typename ViewDimensionInsert< typename nested::dimension , 0 >::type dimension ; + + typedef typename nested::type type [] ; + typedef typename nested::value_type value_type ; + 
typedef typename nested::array_scalar_type array_scalar_type [] ; + + typedef typename nested::const_type const_type [] ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_array_scalar_type const_array_scalar_type [] ; + + typedef typename nested::non_const_type non_const_type [] ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_array_scalar_type non_const_array_scalar_type [] ; +}; + +template< class T , unsigned N > +struct ViewDataAnalysis< T[N] > +{ +private: + + typedef ViewDataAnalysis< T > nested ; + +public: + + typedef typename nested::specialize specialize ; + + typedef typename ViewDimensionInsert< typename nested::dimension , N >::type dimension ; + + typedef typename nested::type type [N] ; + typedef typename nested::value_type value_type ; + typedef typename nested::array_scalar_type array_scalar_type [N] ; + + typedef typename nested::const_type const_type [N] ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_array_scalar_type const_array_scalar_type [N] ; + + typedef typename nested::non_const_type non_const_type [N] ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_array_scalar_type non_const_array_scalar_type [N] ; +}; + +}}} // namespace Kokkos::Experimental::Impl + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template < class Dimension , class Layout , typename Enable = void > +struct ViewOffset ; + +//---------------------------------------------------------------------------- +// LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding +template < class Dimension > +struct ViewOffset< Dimension , Kokkos::LayoutLeft + , typename 
std::enable_if<( 1 >= Dimension::rank + || + 0 == Dimension::rank_dynamic + )>::type > +{ + typedef size_t size_type ; + typedef Dimension dimension_type ; + typedef Kokkos::LayoutLeft array_layout ; + + dimension_type m_dim ; + + //---------------------------------------- + + // rank 1 + template< typename I0 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0 ) const { return i0 ; } + + // rank 2 + template < typename I0 , typename I1 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0 , I1 const & i1 ) const + { return i0 + m_dim.N0 * i1 ; } + + //rank 3 + template < typename I0, typename I1, typename I2 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const + { + return i0 + m_dim.N0 * ( i1 + m_dim.N1 * i2 ); + } + + //rank 4 + template < typename I0, typename I1, typename I2, typename I3 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const + { + return i0 + m_dim.N0 * ( + i1 + m_dim.N1 * ( + i2 + m_dim.N2 * i3 )); + } + + //rank 5 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4 ) const + { + return i0 + m_dim.N0 * ( + i1 + m_dim.N1 * ( + i2 + m_dim.N2 * ( + i3 + m_dim.N3 * i4 ))); + } + + //rank 6 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5 ) const + { + return i0 + m_dim.N0 * ( + i1 + m_dim.N1 * ( + i2 + m_dim.N2 * ( + i3 + m_dim.N3 * ( + i4 + m_dim.N4 * i5 )))); + } + + //rank 7 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5, typename I6 > + KOKKOS_INLINE_FUNCTION 
constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5, I6 const & i6 ) const + { + return i0 + m_dim.N0 * ( + i1 + m_dim.N1 * ( + i2 + m_dim.N2 * ( + i3 + m_dim.N3 * ( + i4 + m_dim.N4 * ( + i5 + m_dim.N5 * i6 ))))); + } + + //rank 8 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5, typename I6, typename I7 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const + { + return i0 + m_dim.N0 * ( + i1 + m_dim.N1 * ( + i2 + m_dim.N2 * ( + i3 + m_dim.N3 * ( + i4 + m_dim.N4 * ( + i5 + m_dim.N5 * ( + i6 + m_dim.N6 * i7 )))))); + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; } + + /* Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const + { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; } + + /* Span of the range space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const + { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { 
return true ; } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N0 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N0 * m_dim.N1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 ; } + + // Stride with [ rank ] value is the total length + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + s[0] = 1 ; + if ( 0 < dimension_type::rank ) { s[1] = m_dim.N0 ; } + if ( 1 < dimension_type::rank ) { s[2] = s[1] * m_dim.N1 ; } + if ( 2 < dimension_type::rank ) { s[3] = s[2] * m_dim.N2 ; } + if ( 3 < dimension_type::rank ) { s[4] = s[3] * m_dim.N3 ; } + if ( 4 < dimension_type::rank ) { s[5] = s[4] * m_dim.N4 ; } + if ( 5 < dimension_type::rank ) { s[6] = s[5] * m_dim.N5 ; } + if ( 6 < dimension_type::rank ) { s[7] = s[6] * m_dim.N6 ; } + if ( 7 < dimension_type::rank ) { s[8] = s[7] * m_dim.N7 ; } + } + + //---------------------------------------- + + ViewOffset() = default ; + ViewOffset( const ViewOffset & ) = default ; + ViewOffset & operator = ( const ViewOffset & ) = default ; + + template< unsigned TrivialScalarSize > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( std::integral_constant<unsigned,TrivialScalarSize> const & + , size_t aN0 , unsigned aN1 , unsigned aN2 , unsigned 
aN3 + , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned aN7 ) + : m_dim( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + {} + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ) + : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 + , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 ) + { + static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" ); + // Also requires equal static dimensions ... + } + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs ) + : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 ) + { + static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1 + , "ViewOffset LayoutLeft and LayoutRight are only compatible when rank == 1" ); + } + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs ) + : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 ) + { + static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1 + , "ViewOffset LayoutLeft and LayoutStride are only compatible when rank == 1" ); + if ( rhs.m_stride.S0 != 1 ) { + Kokkos::abort("Kokkos::Experimental::ViewOffset assignment of LayoutLeft from LayoutStride requires stride == 1" ); + } + } + + //---------------------------------------- + // Subview construction + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs + , const size_t n0 + , const size_t + , const size_t + , const size_t + , const size_t + , const size_t + , const size_t + , const size_t + ) + : m_dim( n0, 0, 0, 0, 0, 0, 0, 0 ) + { + static_assert( ( 0 == dimension_type::rank ) || + ( 1 == dimension_type::rank && 1 == dimension_type::rank_dynamic && 1 <= 
DimRHS::rank ) + , "ViewOffset subview construction requires compatible rank" ); + } +}; + +//---------------------------------------------------------------------------- +// LayoutLeft AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding +template < class Dimension > +struct ViewOffset< Dimension , Kokkos::LayoutLeft + , typename std::enable_if<( 1 < Dimension::rank + && + 0 < Dimension::rank_dynamic + )>::type > +{ + typedef size_t size_type ; + typedef Dimension dimension_type ; + typedef Kokkos::LayoutLeft array_layout ; + + dimension_type m_dim ; + size_type m_stride ; + + //---------------------------------------- + + // rank 1 + template< typename I0 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0 ) const { return i0 ; } + + // rank 2 + template < typename I0 , typename I1 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0 , I1 const & i1 ) const + { return i0 + m_stride * i1 ; } + + //rank 3 + template < typename I0, typename I1, typename I2 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const + { + return i0 + m_stride * ( i1 + m_dim.N1 * i2 ); + } + + //rank 4 + template < typename I0, typename I1, typename I2, typename I3 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const + { + return i0 + m_stride * ( + i1 + m_dim.N1 * ( + i2 + m_dim.N2 * i3 )); + } + + //rank 5 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4 ) const + { + return i0 + m_stride * ( + i1 + m_dim.N1 * ( + i2 + m_dim.N2 * ( + i3 + m_dim.N3 * i4 ))); + } + + //rank 6 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & 
i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5 ) const + { + return i0 + m_stride * ( + i1 + m_dim.N1 * ( + i2 + m_dim.N2 * ( + i3 + m_dim.N3 * ( + i4 + m_dim.N4 * i5 )))); + } + + //rank 7 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5, typename I6 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5, I6 const & i6 ) const + { + return i0 + m_stride * ( + i1 + m_dim.N1 * ( + i2 + m_dim.N2 * ( + i3 + m_dim.N3 * ( + i4 + m_dim.N4 * ( + i5 + m_dim.N5 * i6 ))))); + } + + //rank 8 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5, typename I6, typename I7 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const + { + return i0 + m_stride * ( + i1 + m_dim.N1 * ( + i2 + m_dim.N2 * ( + i3 + m_dim.N3 * ( + i4 + m_dim.N4 * ( + i5 + m_dim.N5 * ( + i6 + m_dim.N6 * i7 )))))); + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; } + + /* Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const + { return 
m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; } + + /* Span of the range space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const + { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_stride == m_dim.N0 ; } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_stride ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_stride * m_dim.N1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_stride * m_dim.N1 * m_dim.N2 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 ; } + + // Stride with [ rank ] value is the total length + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + s[0] = 1 ; + if ( 0 < dimension_type::rank ) { s[1] = m_stride ; } + if ( 1 < dimension_type::rank ) { s[2] = s[1] * m_dim.N1 ; } + if ( 2 < dimension_type::rank ) { s[3] = s[2] * m_dim.N2 ; } + if ( 3 < dimension_type::rank ) { s[4] = s[3] * m_dim.N3 ; } + if ( 4 < dimension_type::rank ) { s[5] = s[4] * m_dim.N4 ; } + if ( 5 < dimension_type::rank ) { s[6] = s[5] * m_dim.N5 ; } + if ( 6 < dimension_type::rank ) { s[7] = s[6] * m_dim.N6 ; } + if ( 7 < dimension_type::rank ) { s[8] = s[7] * m_dim.N7 ; } + } + + 
//---------------------------------------- + +private: + + template< unsigned TrivialScalarSize > + struct Padding { + enum { div = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT / ( TrivialScalarSize ? TrivialScalarSize : 1 ) }; + enum { mod = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT % ( TrivialScalarSize ? TrivialScalarSize : 1 ) }; + + // If memory alignment is a multiple of the trivial scalar size then attempt to align. + enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 }; + enum { div_ok = div ? div : 1 }; // To valid modulo zero in constexpr + + KOKKOS_INLINE_FUNCTION + static constexpr size_t stride( size_t const N ) + { + return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) ) + ? N + align - ( N % div_ok ) : N ; + } + }; + +public: + + ViewOffset() = default ; + ViewOffset( const ViewOffset & ) = default ; + ViewOffset & operator = ( const ViewOffset & ) = default ; + + /* Enable padding for trivial scalar types with non-zero trivial scalar size */ + template< unsigned TrivialScalarSize > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size + , size_t aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3 + , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned aN7 ) + : m_dim( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + , m_stride( Padding<TrivialScalarSize>::stride( aN0 ) ) + {} + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ) + : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 + , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 ) + , m_stride( rhs.stride_1() ) + { + static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" ); + // Also requires equal static dimensions ... 
+ } + + //---------------------------------------- + // Subview construction + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs + , const size_t aN0 + , const size_t aN1 + , const size_t aN2 + , const size_t aN3 + , const size_t aN4 + , const size_t aN5 + , const size_t aN6 + , const size_t aN7 + ) + : m_dim( aN0 + , ( 1 < DimRHS::rank && aN1 ? aN1 : + ( 2 < DimRHS::rank && aN2 ? aN2 : + ( 3 < DimRHS::rank && aN3 ? aN3 : + ( 4 < DimRHS::rank && aN4 ? aN4 : + ( 5 < DimRHS::rank && aN5 ? aN5 : + ( 6 < DimRHS::rank && aN6 ? aN6 : + ( 7 < DimRHS::rank && aN7 ? aN7 : 0 ))))))) + , 0, 0, 0, 0, 0, 0 ) + , m_stride( ( 1 < DimRHS::rank && aN1 ? rhs.stride_1() : + ( 2 < DimRHS::rank && aN2 ? rhs.stride_2() : + ( 3 < DimRHS::rank && aN3 ? rhs.stride_3() : + ( 4 < DimRHS::rank && aN4 ? rhs.stride_4() : + ( 5 < DimRHS::rank && aN5 ? rhs.stride_5() : + ( 6 < DimRHS::rank && aN6 ? rhs.stride_6() : + ( 7 < DimRHS::rank && aN7 ? rhs.stride_7() : 0 ))))))) ) + { + // This subview must be 2 == rank and 2 == rank_dynamic + // due to only having stride #0. + // The source dimension #0 must be non-zero for stride-one leading dimension. + // At most subsequent dimension can be non-zero. 
+ + static_assert( ( 2 == dimension_type::rank ) && + ( 2 == dimension_type::rank_dynamic ) && + ( 2 <= DimRHS::rank ) + , "ViewOffset subview construction requires compatible rank" ); + } +}; + +//---------------------------------------------------------------------------- +// LayoutRight AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding +template < class Dimension > +struct ViewOffset< Dimension , Kokkos::LayoutRight + , typename std::enable_if<( 1 >= Dimension::rank + || + 0 == Dimension::rank_dynamic + )>::type > +{ + typedef size_t size_type ; + typedef Dimension dimension_type ; + typedef Kokkos::LayoutRight array_layout ; + + dimension_type m_dim ; + + //---------------------------------------- + + // rank 1 + template< typename I0 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0 ) const { return i0 ; } + + // rank 2 + template < typename I0 , typename I1 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0 , I1 const & i1 ) const + { return i1 + m_dim.N1 * i0 ; } + + //rank 3 + template < typename I0, typename I1, typename I2 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const + { + return i2 + m_dim.N2 * ( i1 + m_dim.N1 * ( i0 )); + } + + //rank 4 + template < typename I0, typename I1, typename I2, typename I3 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const + { + return i3 + m_dim.N3 * ( + i2 + m_dim.N2 * ( + i1 + m_dim.N1 * ( i0 ))); + } + + //rank 5 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4 ) const + { + return i4 + m_dim.N4 * ( + i3 + m_dim.N3 * ( + i2 + m_dim.N2 * ( + i1 + m_dim.N1 * ( i0 )))); + } + + //rank 6 + template < typename I0, typename I1, typename I2, typename I3 + , 
typename I4, typename I5 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5 ) const + { + return i5 + m_dim.N5 * ( + i4 + m_dim.N4 * ( + i3 + m_dim.N3 * ( + i2 + m_dim.N2 * ( + i1 + m_dim.N1 * ( i0 ))))); + } + + //rank 7 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5, typename I6 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5, I6 const & i6 ) const + { + return i6 + m_dim.N6 * ( + i5 + m_dim.N5 * ( + i4 + m_dim.N4 * ( + i3 + m_dim.N3 * ( + i2 + m_dim.N2 * ( + i1 + m_dim.N1 * ( i0 )))))); + } + + //rank 8 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5, typename I6, typename I7 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const + { + return i7 + m_dim.N7 * ( + i6 + m_dim.N6 * ( + i5 + m_dim.N5 * ( + i4 + m_dim.N4 * ( + i3 + m_dim.N3 * ( + i2 + m_dim.N2 * ( + i1 + m_dim.N1 * ( i0 ))))))); + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; } + + /* 
Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const + { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; } + + /* Span of the range space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const + { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return true ; } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N7 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N7 * m_dim.N6 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1 ; } + + // Stride with [ rank ] value is the total length + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + size_type n = 1 ; + if ( 7 < dimension_type::rank ) { s[7] = n ; n *= m_dim.N7 ; } + if ( 6 < dimension_type::rank ) { s[6] = n ; n *= m_dim.N6 ; } + if ( 5 < dimension_type::rank ) { s[5] = n ; n *= m_dim.N5 ; } + if ( 4 < dimension_type::rank ) { s[4] = n ; n *= m_dim.N4 ; } + if ( 3 < dimension_type::rank ) { s[3] = n ; n *= m_dim.N3 ; } + if ( 2 < dimension_type::rank ) { s[2] = n ; n *= m_dim.N2 ; } + if ( 1 < 
dimension_type::rank ) { s[1] = n ; n *= m_dim.N1 ; } + if ( 0 < dimension_type::rank ) { s[0] = n ; } + s[dimension_type::rank] = n * m_dim.N0 ; + } + + //---------------------------------------- + + ViewOffset() = default ; + ViewOffset( const ViewOffset & ) = default ; + ViewOffset & operator = ( const ViewOffset & ) = default ; + + template< unsigned TrivialScalarSize > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( std::integral_constant<unsigned,TrivialScalarSize> const & + , size_t aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3 + , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned aN7 ) + : m_dim( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + {} + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs ) + : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 + , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 ) + { + static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" ); + // Also requires equal static dimensions ... 
+ } + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ) + : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 ) + { + static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1 + , "ViewOffset LayoutRight and LayoutLeft are only compatible when rank == 1" ); + } + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs ) + : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 ) + { + static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1 + , "ViewOffset LayoutLeft and LayoutStride are only compatible when rank == 1" ); + if ( rhs.m_stride.S0 != 1 ) { + Kokkos::abort("Kokkos::Experimental::ViewOffset assignment of LayoutRight from LayoutStride requires stride == 1" ); + } + } + + //---------------------------------------- + // Subview construction + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs + , const size_t n0 + , const size_t + , const size_t + , const size_t + , const size_t + , const size_t + , const size_t + , const size_t + ) + : m_dim( n0, 0, 0, 0, 0, 0, 0, 0 ) + { + static_assert( ( 0 == dimension_type::rank ) || + ( 1 == dimension_type::rank && 1 == dimension_type::rank_dynamic && 1 <= DimRHS::rank ) + , "ViewOffset subview construction requires compatible rank" ); + } +}; + +//---------------------------------------------------------------------------- +// LayoutRight AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding +template < class Dimension > +struct ViewOffset< Dimension , Kokkos::LayoutRight + , typename std::enable_if<( 1 < Dimension::rank + && + 0 < Dimension::rank_dynamic + )>::type > +{ + typedef size_t size_type ; + typedef Dimension dimension_type ; + typedef Kokkos::LayoutRight array_layout ; + + dimension_type 
m_dim ; + size_type m_stride ; + + //---------------------------------------- + + // rank 1 + template< typename I0 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0 ) const { return i0 ; } + + // rank 2 + template < typename I0 , typename I1 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0 , I1 const & i1 ) const + { return i1 + i0 * m_stride ; } + + //rank 3 + template < typename I0, typename I1, typename I2 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const + { return i2 + m_dim.N2 * ( i1 ) + i0 * m_stride ; } + + //rank 4 + template < typename I0, typename I1, typename I2, typename I3 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const + { + return i3 + m_dim.N3 * ( + i2 + m_dim.N2 * ( i1 )) + + i0 * m_stride ; + } + + //rank 5 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4 ) const + { + return i4 + m_dim.N4 * ( + i3 + m_dim.N3 * ( + i2 + m_dim.N2 * ( i1 ))) + + i0 * m_stride ; + } + + //rank 6 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5 ) const + { + return i5 + m_dim.N5 * ( + i4 + m_dim.N4 * ( + i3 + m_dim.N3 * ( + i2 + m_dim.N2 * ( i1 )))) + + i0 * m_stride ; + } + + //rank 7 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5, typename I6 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5, I6 const & i6 ) const + { + return i6 + m_dim.N6 * ( + i5 + m_dim.N5 * ( + i4 
+ m_dim.N4 * ( + i3 + m_dim.N3 * ( + i2 + m_dim.N2 * ( i1 ))))) + + i0 * m_stride ; + } + + //rank 8 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5, typename I6, typename I7 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const + { + return i7 + m_dim.N7 * ( + i6 + m_dim.N6 * ( + i5 + m_dim.N5 * ( + i4 + m_dim.N4 * ( + i3 + m_dim.N3 * ( + i2 + m_dim.N2 * ( i1 )))))) + + i0 * m_stride ; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; } + + /* Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const + { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; } + + /* Span of the range space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const + { return m_dim.N0 * m_stride ; } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const + { return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1 ; } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() 
const { return m_dim.N7 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N7 * m_dim.N6 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride ; } + + // Stride with [ rank ] value is the total length + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + size_type n = 1 ; + if ( 7 < dimension_type::rank ) { s[7] = n ; n *= m_dim.N7 ; } + if ( 6 < dimension_type::rank ) { s[6] = n ; n *= m_dim.N6 ; } + if ( 5 < dimension_type::rank ) { s[5] = n ; n *= m_dim.N5 ; } + if ( 4 < dimension_type::rank ) { s[4] = n ; n *= m_dim.N4 ; } + if ( 3 < dimension_type::rank ) { s[3] = n ; n *= m_dim.N3 ; } + if ( 2 < dimension_type::rank ) { s[2] = n ; n *= m_dim.N2 ; } + if ( 1 < dimension_type::rank ) { s[1] = n ; } + if ( 0 < dimension_type::rank ) { s[0] = m_stride ; } + s[dimension_type::rank] = m_stride * m_dim.N0 ; + } + + //---------------------------------------- + +private: + + template< unsigned TrivialScalarSize > + struct Padding { + enum { div = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT / ( TrivialScalarSize ? TrivialScalarSize : 1 ) }; + enum { mod = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT % ( TrivialScalarSize ? TrivialScalarSize : 1 ) }; + + // If memory alignment is a multiple of the trivial scalar size then attempt to align. + enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 }; + enum { div_ok = div ? 
div : 1 }; // To valid modulo zero in constexpr + + KOKKOS_INLINE_FUNCTION + static constexpr size_t stride( size_t const N ) + { + return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) ) + ? N + align - ( N % div_ok ) : N ; + } + }; + +public: + + ViewOffset() = default ; + ViewOffset( const ViewOffset & ) = default ; + ViewOffset & operator = ( const ViewOffset & ) = default ; + + /* Enable padding for trivial scalar types with non-zero trivial scalar size. */ + template< unsigned TrivialScalarSize > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size + , size_t aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3 + , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned aN7 ) + : m_dim( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + , m_stride( Padding<TrivialScalarSize>:: + stride( /* 2 <= rank */ + m_dim.N1 * ( dimension_type::rank == 2 ? 1 : + m_dim.N2 * ( dimension_type::rank == 3 ? 1 : + m_dim.N3 * ( dimension_type::rank == 4 ? 1 : + m_dim.N4 * ( dimension_type::rank == 5 ? 1 : + m_dim.N5 * ( dimension_type::rank == 6 ? 1 : + m_dim.N6 * ( dimension_type::rank == 7 ? 1 : m_dim.N7 )))))) )) + {} + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ) + : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 + , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 ) + , m_stride( rhs.stride_0() ) + { + static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" ); + // Also requires equal static dimensions ... 
+ } + + //---------------------------------------- + // Subview construction + // Last dimension must be non-zero + + template< class DimRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs + , const size_t aN0 + , const size_t aN1 + , const size_t aN2 + , const size_t aN3 + , const size_t aN4 + , const size_t aN5 + , const size_t aN6 + , const size_t aN7 + ) + : m_dim( // N0 == First non-zero dimension before the last dimension. + ( 1 < DimRHS::rank && aN0 ? aN0 : + ( 2 < DimRHS::rank && aN1 ? aN1 : + ( 3 < DimRHS::rank && aN2 ? aN2 : + ( 4 < DimRHS::rank && aN3 ? aN3 : + ( 5 < DimRHS::rank && aN4 ? aN4 : + ( 6 < DimRHS::rank && aN5 ? aN5 : + ( 7 < DimRHS::rank && aN6 ? aN6 : 0 ))))))) + , // N1 == Last dimension. + ( 2 == DimRHS::rank ? aN1 : + ( 3 == DimRHS::rank ? aN2 : + ( 4 == DimRHS::rank ? aN3 : + ( 5 == DimRHS::rank ? aN4 : + ( 6 == DimRHS::rank ? aN5 : + ( 7 == DimRHS::rank ? aN6 : aN7 )))))) + , 0, 0, 0, 0, 0, 0 ) + , m_stride( ( 1 < DimRHS::rank && aN0 ? rhs.stride_0() : + ( 2 < DimRHS::rank && aN1 ? rhs.stride_1() : + ( 3 < DimRHS::rank && aN2 ? rhs.stride_2() : + ( 4 < DimRHS::rank && aN3 ? rhs.stride_3() : + ( 5 < DimRHS::rank && aN4 ? rhs.stride_4() : + ( 6 < DimRHS::rank && aN5 ? rhs.stride_5() : + ( 7 < DimRHS::rank && aN6 ? rhs.stride_6() : 0 ))))))) ) + { + // This subview must be 2 == rank and 2 == rank_dynamic + // due to only having stride #0. + // The source dimension #0 must be non-zero for stride-one leading dimension. + // At most subsequent dimension can be non-zero. 
+ + static_assert( ( 2 == dimension_type::rank ) && + ( 2 == dimension_type::rank_dynamic ) && + ( 2 <= DimRHS::rank ) + , "ViewOffset subview construction requires compatible rank" ); + } +}; + +//---------------------------------------------------------------------------- +/* Strided array layout only makes sense for 0 < rank */ + +template< unsigned Rank > +struct ViewStride ; + +template<> +struct ViewStride<1> { + size_t S0 ; + enum { S1 = 0 , S2 = 0 , S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 }; + + ViewStride() = default ; + ViewStride( const ViewStride & ) = default ; + ViewStride & operator = ( const ViewStride & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride( size_t aS0 , size_t , size_t , size_t + , size_t , size_t , size_t , size_t ) + : S0( aS0 ) + {} +}; + +template<> +struct ViewStride<2> { + size_t S0 , S1 ; + enum { S2 = 0 , S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 }; + + ViewStride() = default ; + ViewStride( const ViewStride & ) = default ; + ViewStride & operator = ( const ViewStride & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride( size_t aS0 , size_t aS1 , size_t , size_t + , size_t , size_t , size_t , size_t ) + : S0( aS0 ) , S1( aS1 ) + {} +}; + +template<> +struct ViewStride<3> { + size_t S0 , S1 , S2 ; + enum { S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 }; + + ViewStride() = default ; + ViewStride( const ViewStride & ) = default ; + ViewStride & operator = ( const ViewStride & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t + , size_t , size_t , size_t , size_t ) + : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) + {} +}; + +template<> +struct ViewStride<4> { + size_t S0 , S1 , S2 , S3 ; + enum { S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 }; + + ViewStride() = default ; + ViewStride( const ViewStride & ) = default ; + ViewStride & operator = ( const ViewStride & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 
, size_t aS3 + , size_t , size_t , size_t , size_t ) + : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 ) + {} +}; + +template<> +struct ViewStride<5> { + size_t S0 , S1 , S2 , S3 , S4 ; + enum { S5 = 0 , S6 = 0 , S7 = 0 }; + + ViewStride() = default ; + ViewStride( const ViewStride & ) = default ; + ViewStride & operator = ( const ViewStride & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3 + , size_t aS4 , size_t , size_t , size_t ) + : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 ) + , S4( aS4 ) + {} +}; + +template<> +struct ViewStride<6> { + size_t S0 , S1 , S2 , S3 , S4 , S5 ; + enum { S6 = 0 , S7 = 0 }; + + ViewStride() = default ; + ViewStride( const ViewStride & ) = default ; + ViewStride & operator = ( const ViewStride & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3 + , size_t aS4 , size_t aS5 , size_t , size_t ) + : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 ) + , S4( aS4 ) , S5( aS5 ) + {} +}; + +template<> +struct ViewStride<7> { + size_t S0 , S1 , S2 , S3 , S4 , S5 , S6 ; + enum { S7 = 0 }; + + ViewStride() = default ; + ViewStride( const ViewStride & ) = default ; + ViewStride & operator = ( const ViewStride & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3 + , size_t aS4 , size_t aS5 , size_t aS6 , size_t ) + : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 ) + , S4( aS4 ) , S5( aS5 ) , S6( aS6 ) + {} +}; + +template<> +struct ViewStride<8> { + size_t S0 , S1 , S2 , S3 , S4 , S5 , S6 , S7 ; + + ViewStride() = default ; + ViewStride( const ViewStride & ) = default ; + ViewStride & operator = ( const ViewStride & ) = default ; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3 + , size_t aS4 , size_t aS5 , size_t aS6 , size_t aS7 ) + : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 ) + , S4( aS4 ) , S5( aS5 ) , 
S6( aS6 ) , S7( aS7 ) + {} +}; + +template < class Dimension > +struct ViewOffset< Dimension , Kokkos::LayoutStride + , typename std::enable_if<( 0 < Dimension::rank )>::type > +{ +private: + typedef ViewStride< Dimension::rank > stride_type ; +public: + + typedef size_t size_type ; + typedef Dimension dimension_type ; + typedef Kokkos::LayoutStride array_layout ; + + dimension_type m_dim ; + stride_type m_stride ; + + //---------------------------------------- + + // rank 1 + template< typename I0 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0 ) const + { + return i0 * m_stride.S0 ; + } + + // rank 2 + template < typename I0 , typename I1 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0 , I1 const & i1 ) const + { + return i0 * m_stride.S0 + + i1 * m_stride.S1 ; + } + + //rank 3 + template < typename I0, typename I1, typename I2 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const + { + return i0 * m_stride.S0 + + i1 * m_stride.S1 + + i2 * m_stride.S2 ; + } + + //rank 4 + template < typename I0, typename I1, typename I2, typename I3 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const + { + return i0 * m_stride.S0 + + i1 * m_stride.S1 + + i2 * m_stride.S2 + + i3 * m_stride.S3 ; + } + + //rank 5 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4 ) const + { + return i0 * m_stride.S0 + + i1 * m_stride.S1 + + i2 * m_stride.S2 + + i3 * m_stride.S3 + + i4 * m_stride.S4 ; + } + + //rank 6 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 
const & i5 ) const + { + return i0 * m_stride.S0 + + i1 * m_stride.S1 + + i2 * m_stride.S2 + + i3 * m_stride.S3 + + i4 * m_stride.S4 + + i5 * m_stride.S5 ; + } + + //rank 7 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5, typename I6 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5, I6 const & i6 ) const + { + return i0 * m_stride.S0 + + i1 * m_stride.S1 + + i2 * m_stride.S2 + + i3 * m_stride.S3 + + i4 * m_stride.S4 + + i5 * m_stride.S5 + + i6 * m_stride.S6 ; + } + + //rank 8 + template < typename I0, typename I1, typename I2, typename I3 + , typename I4, typename I5, typename I6, typename I7 > + KOKKOS_INLINE_FUNCTION constexpr + size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 + , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const + { + return i0 * m_stride.S0 + + i1 * m_stride.S1 + + i2 * m_stride.S2 + + i3 * m_stride.S3 + + i4 * m_stride.S4 + + i5 * m_stride.S5 + + i6 * m_stride.S6 + + i7 * m_stride.S7 ; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; } + + /* Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const + { return m_dim.N0 * 
m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; } + +private: + + KOKKOS_INLINE_FUNCTION + static constexpr size_type Max( size_type lhs , size_type rhs ) + { return lhs < rhs ? rhs : lhs ; } + +public: + + /* Span of the range space, largest stride * dimension */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const + { + return Max( m_dim.N0 * m_stride.S0 , + Max( m_dim.N1 * m_stride.S1 , + Max( m_dim.N2 * m_stride.S2 , + Max( m_dim.N3 * m_stride.S3 , + Max( m_dim.N4 * m_stride.S4 , + Max( m_dim.N5 * m_stride.S5 , + Max( m_dim.N6 * m_stride.S6 , + m_dim.N7 * m_stride.S7 ))))))); + } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return span() == size(); } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride.S0 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_stride.S1 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_stride.S2 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_stride.S3 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_stride.S4 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_stride.S5 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_stride.S6 ; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_stride.S7 ; } + + // Stride with [ rank ] value is the total length + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + if ( 0 < dimension_type::rank ) { s[0] = m_stride.S0 ; } + if ( 1 < dimension_type::rank ) { s[1] = m_stride.S1 ; } + if ( 2 < dimension_type::rank ) { s[2] = m_stride.S2 ; } + if ( 3 < dimension_type::rank ) { s[3] = m_stride.S3 ; } + if ( 4 < dimension_type::rank ) { s[4] = m_stride.S4 ; } + if ( 5 < dimension_type::rank ) { s[5] = m_stride.S5 ; } + if ( 6 < dimension_type::rank ) { s[6] 
= m_stride.S6 ; } + if ( 7 < dimension_type::rank ) { s[7] = m_stride.S7 ; } + s[dimension_type::rank] = span(); + } + + //---------------------------------------- + + ViewOffset() = default ; + ViewOffset( const ViewOffset & ) = default ; + ViewOffset & operator = ( const ViewOffset & ) = default ; + + KOKKOS_INLINE_FUNCTION + ViewOffset( const Kokkos::LayoutStride & rhs ) + : m_dim( rhs.dimension[0] , rhs.dimension[1] , rhs.dimension[2] , rhs.dimension[3] + , rhs.dimension[4] , rhs.dimension[5] , rhs.dimension[6] , rhs.dimension[7] ) + , m_stride( rhs.stride[0] , rhs.stride[1] , rhs.stride[2] , rhs.stride[3] + , rhs.stride[4] , rhs.stride[5] , rhs.stride[6] , rhs.stride[7] ) + {} + + template< class DimRHS , class LayoutRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , LayoutRHS , void > & rhs ) + : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 + , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 ) + , m_stride( rhs.stride_0() , rhs.stride_1() , rhs.stride_2() , rhs.stride_3() + , rhs.stride_4() , rhs.stride_5() , rhs.stride_6() , rhs.stride_7() ) + { + static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" ); + // Also requires equal static dimensions ... + } + + //---------------------------------------- + // Subview construction + +private: + + KOKKOS_INLINE_FUNCTION + static constexpr unsigned + count_non_zero( const size_t aN0 = 0 + , const size_t aN1 = 0 + , const size_t aN2 = 0 + , const size_t aN3 = 0 + , const size_t aN4 = 0 + , const size_t aN5 = 0 + , const size_t aN6 = 0 + , const size_t aN7 = 0 + ) + { + return ( aN0 ? 1 : 0 ) + + ( aN1 ? 1 : 0 ) + + ( aN2 ? 1 : 0 ) + + ( aN3 ? 1 : 0 ) + + ( aN4 ? 1 : 0 ) + + ( aN5 ? 1 : 0 ) + + ( aN6 ? 1 : 0 ) + + ( aN7 ? 
1 : 0 ); + } + + template< unsigned Rank , unsigned I > + KOKKOS_INLINE_FUNCTION + static constexpr size_t + get_non_zero( const size_t aN0 + , const size_t aN1 + , const size_t aN2 + , const size_t aN3 + , const size_t aN4 + , const size_t aN5 + , const size_t aN6 + , const size_t aN7 + ) + { + return ( 0 < Rank && I < 1 && aN0 ? aN0 : + ( 1 < Rank && I < 2 && I == count_non_zero(aN0) && aN1 ? aN1 : + ( 2 < Rank && I < 3 && I == count_non_zero(aN0,aN1) && aN2 ? aN2 : + ( 3 < Rank && I < 4 && I == count_non_zero(aN0,aN1,aN2) && aN3 ? aN3 : + ( 4 < Rank && I < 5 && I == count_non_zero(aN0,aN1,aN2,aN3) && aN4 ? aN4 : + ( 5 < Rank && I < 6 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4) && aN5 ? aN5 : + ( 6 < Rank && I < 7 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4,aN5) && aN6 ? aN6 : + ( 7 < Rank && I < 8 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4,aN5,aN6) && aN7 ? aN7 : 0 )))))))); + } + + template< unsigned Rank , unsigned I , class DimRHS , class LayoutRHS > + KOKKOS_INLINE_FUNCTION + static constexpr size_t + get_non_zero( const size_t aN0 , const size_t aN1 , const size_t aN2 , const size_t aN3 + , const size_t aN4 , const size_t aN5 , const size_t aN6 , const size_t aN7 + , const ViewOffset< DimRHS , LayoutRHS , void > & rhs ) + { + return ( 0 < Rank && I < 1 && aN0 ? rhs.stride_0() : + ( 1 < Rank && I < 2 && I == count_non_zero(aN0) && aN1 ? rhs.stride_1() : + ( 2 < Rank && I < 3 && I == count_non_zero(aN0,aN1) && aN2 ? rhs.stride_2() : + ( 3 < Rank && I < 4 && I == count_non_zero(aN0,aN1,aN2) && aN3 ? rhs.stride_3() : + ( 4 < Rank && I < 5 && I == count_non_zero(aN0,aN1,aN2,aN3) && aN4 ? rhs.stride_4() : + ( 5 < Rank && I < 6 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4) && aN5 ? rhs.stride_5() : + ( 6 < Rank && I < 7 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4,aN5) && aN6 ? rhs.stride_6() : + ( 7 < Rank && I < 8 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4,aN5,aN6) && aN7 ? 
rhs.stride_7() : 0 )))))))); + } + + +public: + + template< class DimRHS , class LayoutRHS > + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset( const ViewOffset< DimRHS , LayoutRHS , void > & rhs + , const size_t aN0 + , const size_t aN1 + , const size_t aN2 + , const size_t aN3 + , const size_t aN4 + , const size_t aN5 + , const size_t aN6 + , const size_t aN7 + ) + // Contract the non-zero dimensions + : m_dim( ViewOffset::template get_non_zero<DimRHS::rank,0>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + , ViewOffset::template get_non_zero<DimRHS::rank,1>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + , ViewOffset::template get_non_zero<DimRHS::rank,2>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + , ViewOffset::template get_non_zero<DimRHS::rank,3>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + , ViewOffset::template get_non_zero<DimRHS::rank,4>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + , ViewOffset::template get_non_zero<DimRHS::rank,5>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + , ViewOffset::template get_non_zero<DimRHS::rank,6>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + , ViewOffset::template get_non_zero<DimRHS::rank,7>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 ) + ) + , m_stride( ViewOffset::template get_non_zero<DimRHS::rank,0>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs ) + , ViewOffset::template get_non_zero<DimRHS::rank,1>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs ) + , ViewOffset::template get_non_zero<DimRHS::rank,2>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs ) + , ViewOffset::template get_non_zero<DimRHS::rank,3>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs ) + , ViewOffset::template get_non_zero<DimRHS::rank,4>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs ) + , ViewOffset::template get_non_zero<DimRHS::rank,5>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs ) + , ViewOffset::template get_non_zero<DimRHS::rank,6>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs ) + , ViewOffset::template get_non_zero<DimRHS::rank,7>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs 
) + ) + { + } + + //---------------------------------------- +}; + +}}} // namespace Kokkos::Experimental::Impl + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +struct ALL_t {}; + +template< class T > +struct ViewOffsetRange { + + static_assert( std::is_integral<T>::value , "Non-range must be an integral type" ); + + enum { is_range = false }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const , T const & ) { return 0 ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( T const & i ) { return size_t(i) ; } +}; + +template<> +struct ViewOffsetRange<void> { + enum { is_range = false }; +}; + +template<> +struct ViewOffsetRange< Kokkos::Experimental::Impl::ALL_t > { + enum { is_range = true }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const n , Experimental::Impl::ALL_t const & ) { return n ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( Experimental::Impl::ALL_t const & ) { return 0 ; } +}; + +template< typename iType > +struct ViewOffsetRange< std::pair<iType,iType> > { + + static_assert( std::is_integral<iType>::value , "Range bounds must be an integral type" ); + + enum { is_range = true }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const n , std::pair<iType,iType> const & r ) + { return ( size_t(r.first) < size_t(r.second) && size_t(r.second) <= n ) ? 
size_t(r.second) - size_t(r.first) : 0 ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( std::pair<iType,iType> const & r ) { return size_t(r.first) ; } +}; + +template< typename iType > +struct ViewOffsetRange< Kokkos::pair<iType,iType> > { + + static_assert( std::is_integral<iType>::value , "Range bounds must be an integral type" ); + + enum { is_range = true }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const n , Kokkos::pair<iType,iType> const & r ) + { return ( size_t(r.first) < size_t(r.second) && size_t(r.second) <= n ) ? size_t(r.second) - size_t(r.first) : 0 ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( Kokkos::pair<iType,iType> const & r ) { return size_t(r.first) ; } +}; + +template< typename iType > +struct ViewOffsetRange< std::initializer_list< iType > > { + + static_assert( std::is_integral<iType>::value , "Range bounds must be an integral type" ); + + enum { is_range = true }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const n , std::initializer_list< iType > const & r ) + { + return ( size_t(r.begin()[0]) < size_t(r.begin()[1]) && size_t(r.begin()[1]) <= n ) + ? size_t(r.begin()[1]) - size_t(r.begin()[0]) : 0 ; + } + + KOKKOS_INLINE_FUNCTION static + size_t begin( std::initializer_list< iType > const & r ) { return size_t(r.begin()[0]) ; } +}; + +}}} // namespace Kokkos::Experimental::Impl + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +/** \brief ViewDataHandle provides the type of the 'data handle' which the view + * uses to access data with the [] operator. It also provides + * an allocate function and a function to extract a raw ptr from the + * data handle. 
ViewDataHandle also defines an enum ReferenceAble which + * specifies whether references/pointers to elements can be taken and a + * 'return_type' which is what the view operators will give back. + * Specialisation of this object allows three things depending + * on ViewTraits and compiler options: + * (i) Use special allocator (e.g. huge pages/small pages and pinned memory) + * (ii) Use special data handle type (e.g. add Cuda Texture Object) + * (iii) Use special access intrinsics (e.g. texture fetch and non-caching loads) + */ +template< class Traits , class Enable = void > +struct ViewDataHandle { + + typedef typename Traits::value_type value_type ; + typedef typename Traits::value_type * handle_type ; + typedef typename Traits::value_type & return_type ; + typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ; + + KOKKOS_INLINE_FUNCTION + static handle_type assign( value_type * arg_data_ptr + , track_type const & /*arg_tracker*/ ) + { + return handle_type( arg_data_ptr ); + } +}; + +template< class Traits > +struct ViewDataHandle< Traits , + typename std::enable_if<( std::is_same< typename Traits::non_const_value_type + , typename Traits::value_type >::value + && + Traits::memory_traits::Atomic + )>::type > +{ + typedef typename Traits::value_type value_type ; + typedef typename Kokkos::Impl::AtomicViewDataHandle< Traits > handle_type ; + typedef typename Kokkos::Impl::AtomicDataElement< Traits > return_type ; + typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ; + + KOKKOS_INLINE_FUNCTION + static handle_type assign( value_type * arg_data_ptr + , track_type const & /*arg_tracker*/ ) + { + return handle_type( arg_data_ptr ); + } +}; + +}}} // namespace Kokkos::Experimental::Impl + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template< class 
Traits + , bool R0 = false + , bool R1 = false + , bool R2 = false + , bool R3 = false + , bool R4 = false + , bool R5 = false + , bool R6 = false + , bool R7 = false + , typename Enable = void > +struct SubviewMapping ; + +/** \brief View mapping for non-specialized data type and standard layout */ +template< class Traits > +class ViewMapping< Traits , void , + typename std::enable_if<( + std::is_same< typename Traits::specialize , void >::value + && + ( + std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value || + std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value || + std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value + ) + )>::type > +{ +private: + + template< class , class , typename > friend class ViewMapping ; + template< class , bool , bool , bool , bool , bool , bool , bool , bool , class > friend struct SubviewMapping ; + template< class , class , class , class > friend class Kokkos::Experimental::View ; + + typedef ViewOffset< typename Traits::dimension + , typename Traits::array_layout + , void + > offset_type ; + + typedef typename ViewDataHandle< Traits >::handle_type handle_type ; + + handle_type m_handle ; + offset_type m_offset ; + +public: + + //---------------------------------------- + // Domain dimensions + + enum { Rank = Traits::dimension::rank }; + + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_offset.dimension_0(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_offset.dimension_1(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_offset.dimension_2(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_offset.dimension_3(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_offset.dimension_4(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_offset.dimension_5(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() 
const { return m_offset.dimension_6(); } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_offset.dimension_7(); } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_offset.stride_0(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_offset.stride_1(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_offset.stride_2(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_offset.stride_3(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_offset.stride_4(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_offset.stride_5(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_offset.stride_6(); } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_offset.stride_7(); } + + /* + KOKKOS_INLINE_FUNCTION + Kokkos::Array<size_t,Rank> dimension() const + { return Kokkos::Experimental::Impl::dimension( m_offset.m_dim ); } + */ + + //---------------------------------------- + // Range span + + /** \brief Span of the mapped range */ + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_offset.span(); } + + /** \brief Is the mapped range span contiguous */ + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_offset.span_is_contiguous(); } + + typedef typename ViewDataHandle< Traits >::return_type reference_type ; + + /** \brief If data references are lvalue_reference than can query pointer to memory */ + KOKKOS_INLINE_FUNCTION constexpr typename Traits::value_type * data() const + { + typedef typename Traits::value_type * ptr_type ; + + return std::is_lvalue_reference< reference_type >::value + ? (ptr_type) m_handle + : (ptr_type) 0 ; + } + + //---------------------------------------- + // The View class performs all rank and bounds checking before + // calling these element reference methods. 
+ + KOKKOS_FORCEINLINE_FUNCTION + reference_type reference() const { return m_handle[0]; } + + template< typename I0 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type reference( const I0 & i0 ) const { return m_handle[i0]; } + + template< typename I0 , typename I1 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type reference( const I0 & i0 , const I1 & i1 ) const + { return m_handle[ m_offset(i0,i1) ]; } + + template< typename I0 , typename I1 , typename I2 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 ) const + { return m_handle[ m_offset(i0,i1,i2) ]; } + + template< typename I0 , typename I1 , typename I2 , typename I3 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 ) const + { return m_handle[ m_offset(i0,i1,i2,i3) ]; } + + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 ) const + { return m_handle[ m_offset(i0,i1,i2,i3,i4) ]; } + + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 ) const + { return m_handle[ m_offset(i0,i1,i2,i3,i4,i5) ]; } + + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 , typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 , const I6 & i6 ) const + { return m_handle[ m_offset(i0,i1,i2,i3,i4,i5,i6) ]; } + + template< typename I0 , typename I1 , typename I2 , typename I3 + , typename I4 , typename I5 , typename I6 , typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + reference_type reference( 
const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 + , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 ) const + { return m_handle[ m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ]; } + + //---------------------------------------- + +private: + + enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ }; + enum { MemorySpanSize = sizeof(typename Traits::value_type) }; + +public: + + /** \brief Span, in bytes, of the referenced memory */ + KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const + { + return ( m_offset.span() * sizeof(typename Traits::value_type) + MemorySpanMask ) & ~size_t(MemorySpanMask); + } + + /** \brief Span, in bytes, of the required memory */ + template< bool AllowPadding > + KOKKOS_INLINE_FUNCTION + static constexpr size_t memory_span( const std::integral_constant<bool,AllowPadding> & + , const size_t N0 , const size_t N1 , const size_t N2 , const size_t N3 + , const size_t N4 , const size_t N5 , const size_t N6 , const size_t N7 ) + { + typedef std::integral_constant< unsigned , AllowPadding ? 
MemorySpanSize : 0 > padding ; + return ( offset_type( padding(), N0, N1, N2, N3, N4, N5, N6, N7 ).span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask); + } + + /** \brief Span, in bytes, of the required memory */ + template< bool AllowPadding > + KOKKOS_INLINE_FUNCTION + static constexpr size_t memory_span( const std::integral_constant<bool,AllowPadding> & + , const typename Traits::array_layout & layout ) + { + return ( offset_type( layout ).span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask); + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION ~ViewMapping() {} + KOKKOS_INLINE_FUNCTION ViewMapping() : m_handle(), m_offset() {} + KOKKOS_INLINE_FUNCTION ViewMapping( const ViewMapping & rhs ) + : m_handle( rhs.m_handle ), m_offset( rhs.m_offset ) {} + KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( const ViewMapping & rhs ) + { m_handle = rhs.m_handle ; m_offset = rhs.m_offset ; return *this ; } + + KOKKOS_INLINE_FUNCTION ViewMapping( ViewMapping && rhs ) + : m_handle( rhs.m_handle ), m_offset( rhs.m_offset ) {} + KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( ViewMapping && rhs ) + { m_handle = rhs.m_handle ; m_offset = rhs.m_offset ; return *this ; } + + template< bool AllowPadding > + KOKKOS_INLINE_FUNCTION + ViewMapping( void * ptr + , const std::integral_constant<bool,AllowPadding> & + , const size_t N0 , const size_t N1 , const size_t N2 , const size_t N3 + , const size_t N4 , const size_t N5 , const size_t N6 , const size_t N7 ) + : m_handle( reinterpret_cast< handle_type >( ptr ) ) + , m_offset( std::integral_constant< unsigned , AllowPadding ? 
sizeof(typename Traits::value_type) : 0 >() + , N0, N1, N2, N3, N4, N5, N6, N7 ) + {} + + template< bool AllowPadding > + KOKKOS_INLINE_FUNCTION + ViewMapping( void * ptr + , const std::integral_constant<bool,AllowPadding> & + , const typename Traits::array_layout & layout ) + : m_handle( reinterpret_cast< handle_type >( ptr ) ) + , m_offset( layout ) + {} + + //---------------------------------------- + // If the View is to construct or destroy the elements. + + struct FunctorTagConstructScalar {}; + struct FunctorTagConstructNonScalar {}; + struct FunctorTagDestructNonScalar {}; + + KOKKOS_FORCEINLINE_FUNCTION + void operator()( const FunctorTagConstructScalar & , const size_t i ) const + { m_handle[i] = 0 ; } + + KOKKOS_FORCEINLINE_FUNCTION + void operator()( const FunctorTagConstructNonScalar & , const size_t i ) const + { + typedef typename Traits::value_type value_type ; + new( & m_handle[i] ) value_type(); + } + + KOKKOS_FORCEINLINE_FUNCTION + void operator()( const FunctorTagDestructNonScalar & , const size_t i ) const + { + typedef typename Traits::value_type value_type ; + ( & (m_handle[i]) )->~value_type(); + } + + template< class ExecSpace > + typename std::enable_if< Kokkos::Impl::is_execution_space<ExecSpace>::value && + std::is_scalar< typename Traits::value_type >::value >::type + construct( const ExecSpace & space ) const + { + typedef Kokkos::RangePolicy< ExecSpace , FunctorTagConstructScalar , size_t > Policy ; + + (void) Kokkos::Impl::ParallelFor< ViewMapping , Policy >( *this , Policy( 0 , m_offset.span() ) ); + } + + template< class ExecSpace > + typename std::enable_if< Kokkos::Impl::is_execution_space<ExecSpace>::value && + ! 
std::is_scalar< typename Traits::value_type >::value >::type + construct( const ExecSpace & space ) const + { + typedef Kokkos::RangePolicy< ExecSpace , FunctorTagConstructNonScalar , size_t > Policy ; + + (void) Kokkos::Impl::ParallelFor< ViewMapping , Policy >( *this , Policy( 0 , m_offset.span() ) ); + } + + template< class ExecSpace > + typename std::enable_if< Kokkos::Impl::is_execution_space<ExecSpace>::value && + std::is_scalar< typename Traits::value_type >::value >::type + destroy( const ExecSpace & ) const {} + + template< class ExecSpace > + typename std::enable_if< Kokkos::Impl::is_execution_space<ExecSpace>::value && + ! std::is_scalar< typename Traits::value_type >::value >::type + destroy( const ExecSpace & space ) const + { + typedef Kokkos::RangePolicy< ExecSpace , FunctorTagDestructNonScalar , size_t > Policy ; + + (void) Kokkos::Impl::ParallelFor< ViewMapping , Policy >( *this , Policy( 0 , m_offset.span() ) ); + } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +/** \brief Assign compatible default mappings */ + +template< class DstTraits , class SrcTraits > +class ViewMapping< DstTraits , SrcTraits , + typename std::enable_if<( + std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value + && + std::is_same< typename DstTraits::specialize , void >::value + && + ( + std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value || + std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value || + std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value + ) + && + std::is_same< typename SrcTraits::specialize , void >::value + && + ( + std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value || + std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value || + std::is_same< typename SrcTraits::array_layout , 
Kokkos::LayoutStride >::value + ) + )>::type > +{ +public: + + enum { is_assignable = true }; + + typedef Kokkos::Experimental::Impl::SharedAllocationTracker TrackType ; + typedef ViewMapping< DstTraits , void , void > DstType ; + typedef ViewMapping< SrcTraits , void , void > SrcType ; + + KOKKOS_INLINE_FUNCTION + static void assign( DstType & dst , const SrcType & src , const TrackType & src_track ) + { + static_assert( std::is_same< typename DstTraits::value_type , typename SrcTraits::value_type >::value || + std::is_same< typename DstTraits::value_type , typename SrcTraits::const_value_type >::value + , "View assignment must have same value type or const = non-const" ); + + static_assert( ViewDimensionAssignable< typename DstTraits::dimension , typename SrcTraits::dimension >::value + , "View assignment must have compatible dimensions" ); + + static_assert( std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value || + std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value || + ( DstTraits::dimension::rank == 0 ) || + ( DstTraits::dimension::rank == 1 && DstTraits::dimension::rank_dynamic == 1 ) + , "View assignment must have compatible layout or have rank <= 1" ); + + typedef typename DstType::offset_type dst_offset_type ; + + dst.m_offset = dst_offset_type( src.m_offset ); + dst.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track ); + } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +/** \brief View mapping for non-specialized data type and standard layout */ +template< class Traits , bool R0 , bool R1 , bool R2 , bool R3 , bool R4 , bool R5 , bool R6 , bool R7 > +struct SubviewMapping< Traits, R0, R1, R2, R3, R4, R5, R6, R7 , + typename std::enable_if<( + std::is_same< typename Traits::specialize , void >::value + && + ( + 
std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value || + std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value || + std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value + ) + )>::type > +{ +private: + + // Subview's rank + enum { rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) }; + + // Whether right-most rank is a range. + enum { R0_rev = 0 == Traits::rank ? false : ( + 1 == Traits::rank ? R0 : ( + 2 == Traits::rank ? R1 : ( + 3 == Traits::rank ? R2 : ( + 4 == Traits::rank ? R3 : ( + 5 == Traits::rank ? R4 : ( + 6 == Traits::rank ? R5 : ( + 7 == Traits::rank ? R6 : R7 ))))))) }; + + // Subview's layout + typedef typename std::conditional< + ( /* Same array layout IF */ + ( rank == 0 ) /* output rank zero */ + || + // OutputRank 1 or 2, InputLayout Left, Interval 0 + // because single stride one or second index has a stride. + ( rank <= 2 && R0 && std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ) + || + // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1] + // because single stride one or second index has a stride. 
+ ( rank <= 2 && R0_rev && std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ) + ), typename Traits::array_layout , Kokkos::LayoutStride + >::type array_layout ; + + typedef typename Traits::value_type value_type ; + + typedef typename std::conditional< rank == 0 , value_type , + typename std::conditional< rank == 1 , value_type * , + typename std::conditional< rank == 2 , value_type ** , + typename std::conditional< rank == 3 , value_type *** , + typename std::conditional< rank == 4 , value_type **** , + typename std::conditional< rank == 5 , value_type ***** , + typename std::conditional< rank == 6 , value_type ****** , + typename std::conditional< rank == 7 , value_type ******* , + value_type ******** + >::type >::type >::type >::type >::type >::type >::type >::type + data_type ; + +public: + + typedef + Kokkos::Experimental::ViewTraits< data_type , array_layout + , typename Traits::device_type + , typename Traits::memory_traits > traits_type ; + + typedef Kokkos::Experimental::View< data_type + , array_layout + , typename Traits::device_type + , typename Traits::memory_traits > type ; + + template< class T0 , class T1 , class T2 , class T3 + , class T4 , class T5 , class T6 , class T7 > + KOKKOS_INLINE_FUNCTION + static void assign( ViewMapping< traits_type , void , void > & dst + , ViewMapping< Traits , void , void > const & src + , T0 const & arg0 + , T1 const & arg1 + , T2 const & arg2 + , T3 const & arg3 + , T4 const & arg4 + , T5 const & arg5 + , T6 const & arg6 + , T7 const & arg7 + ) + { + typedef ViewMapping< traits_type , void , void > DstType ; + + typedef typename DstType::offset_type dst_offset_type ; + typedef typename DstType::handle_type dst_handle_type ; + + typedef Kokkos::Experimental::Impl::ViewOffsetRange<T0> V0 ; + typedef Kokkos::Experimental::Impl::ViewOffsetRange<T1> V1 ; + typedef Kokkos::Experimental::Impl::ViewOffsetRange<T2> V2 ; + typedef Kokkos::Experimental::Impl::ViewOffsetRange<T3> V3 ; + typedef 
Kokkos::Experimental::Impl::ViewOffsetRange<T4> V4 ; + typedef Kokkos::Experimental::Impl::ViewOffsetRange<T5> V5 ; + typedef Kokkos::Experimental::Impl::ViewOffsetRange<T6> V6 ; + typedef Kokkos::Experimental::Impl::ViewOffsetRange<T7> V7 ; + + dst.m_offset = dst_offset_type + ( src.m_offset + , V0::dimension( src.m_offset.dimension_0() , arg0 ) + , V1::dimension( src.m_offset.dimension_1() , arg1 ) + , V2::dimension( src.m_offset.dimension_2() , arg2 ) + , V3::dimension( src.m_offset.dimension_3() , arg3 ) + , V4::dimension( src.m_offset.dimension_4() , arg4 ) + , V5::dimension( src.m_offset.dimension_5() , arg5 ) + , V6::dimension( src.m_offset.dimension_6() , arg6 ) + , V7::dimension( src.m_offset.dimension_7() , arg7 ) + ); + + dst.m_handle = dst_handle_type( src.m_handle + + src.m_offset( V0::begin( arg0 ) + , V1::begin( arg1 ) + , V2::begin( arg2 ) + , V3::begin( arg3 ) + , V4::begin( arg4 ) + , V5::begin( arg5 ) + , V6::begin( arg6 ) + , V7::begin( arg7 ) + ) ); + } +}; + +}}} // namespace Kokkos::Experimental::Impl + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template< class V + , bool R0 = false , bool R1 = false , bool R2 = false , bool R3 = false + , bool R4 = false , bool R5 = false , bool R6 = false , bool R7 = false > +struct SubviewType ; + +template< class D , class A1, class A2, class A3 + , bool R0 , bool R1 , bool R2 , bool R3 + , bool R4 , bool R5 , bool R6 , bool R7 > +struct SubviewType< Kokkos::Experimental::View< D , A1, A2, A3 > , R0 , R1 , R2 , R3 , R4 , R5 , R6 , R7 > +{ +private: + typedef Kokkos::Experimental::ViewTraits< D , A1 , A2 , A3 > traits ; + typedef Kokkos::Experimental::Impl::SubviewMapping< traits , R0 , R1 , R2 , R3 , R4 , R5 , R6 , R7 > mapping ; +public: + typedef typename mapping::type type ; +}; + +}}} // namespace 
Kokkos::Experimental::Impl + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +class Error_view_scalar_reference_to_non_scalar_view ; + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + +#if defined( KOKKOS_EXPRESSION_CHECK ) + +#define KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( SPACE , MAP , RANK , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) \ + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \ + Kokkos::Impl::ActiveExecutionMemorySpace , SPACE >::verify( MAP.data() ); \ + /* array bounds checking */ + +#else + +#define KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( SPACE , MAP , RANK , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) \ + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \ + Kokkos::Impl::ActiveExecutionMemorySpace , SPACE >::verify( MAP.data() ) + +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp new file mode 100755 index 0000000000000000000000000000000000000000..7fb33853d667c829417bffda2146e4149c3cf2d2 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp @@ -0,0 +1,844 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core_fwd.hpp> + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + +#include <Kokkos_Atomic.hpp> + +#include <impl/Kokkos_Singleton.hpp> +#include <impl/Kokkos_AllocationTracker.hpp> +#include <impl/Kokkos_Error.hpp> + + +#include <string> +#include <vector> +#include <sstream> +#include <algorithm> +#include <utility> +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <iomanip> + +/* Enable clean up of memory leaks */ +#define CLEAN_UP_MEMORY_LEAKS 0 + +namespace Kokkos { namespace Impl { + +namespace { + + +//----------------------------------------------------------------------------- +// AllocationRecord +//----------------------------------------------------------------------------- +// +// Used to track details about an allocation and provide a ref count +// sizeof(AllocationRecord) == 128 +struct AllocationRecord +{ + enum { + OFFSET = sizeof(AllocatorBase*) // allocator + + sizeof(void*) // alloc_ptr + + sizeof(uint64_t) // alloc_size + + sizeof(AllocatorAttributeBase*) // attribute + + sizeof(uint32_t) // node_index + + sizeof(uint32_t) // ref_count + , LABEL_LENGTH = 128 - OFFSET + }; + + AllocatorBase * const allocator; + void * const alloc_ptr; + const uint64_t alloc_size; + AllocatorAttributeBase * const attribute; + const int32_t node_index; + volatile uint32_t ref_count; + const char label[LABEL_LENGTH]; + + + AllocationRecord( AllocatorBase * const arg_allocator + , void * arg_alloc_ptr + , uint64_t arg_alloc_size + , int32_t arg_node_index + , const std::string & arg_label + ) + : allocator(arg_allocator) + , alloc_ptr(arg_alloc_ptr) + , alloc_size(arg_alloc_size) + , attribute(NULL) + , node_index(arg_node_index) + , ref_count(1) + , label() // zero fill + { + const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? 
static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size(); + strncpy( const_cast<char *>(label), arg_label.c_str(), length ); + } + + ~AllocationRecord() + { + if (attribute) { + delete attribute; + } + } + + uint32_t increment_ref_count() + { + uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) ); + return old_value + 1u; + } + + uint32_t decrement_ref_count() + { + uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) ); + return old_value - 1u; + } + + void print( std::ostream & oss ) const + { + oss << "{ " << allocator->name() + << " } : \"" << label + << "\" ref_count(" << ref_count + << ") memory[ " << alloc_ptr + << " + " << alloc_size + << " ]" ; + } + + bool set_attribute( AllocatorAttributeBase * attr ) + { + bool result = false; + if (attribute == NULL) { + result = NULL == atomic_compare_exchange( const_cast<AllocatorAttributeBase **>(&attribute) + , reinterpret_cast<AllocatorAttributeBase *>(NULL) + , attr ); + } + + return result; + } + + // disallow copy and assignment + AllocationRecord( const AllocationRecord & ); + AllocationRecord & operator=(const AllocationRecord &); +}; + +template <int NumBlocks> +struct Bitset +{ + enum { blocks = NumBlocks }; + enum { size = blocks * 64 }; + enum { block_mask = 63u }; + enum { block_shift = 6 }; + + // used to find free bits in a bitset + static int count_trailing_zeros(uint64_t x) + { + #if defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC ) + return x ? __builtin_ctzll(x) : 64; + #elif defined( KOKKOS_COMPILER_INTEL ) + enum { shift = 32 }; + enum { mask = (static_cast<uint64_t>(1) << shift) - 1u }; + return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) : + (x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) : + 64 ; + #elif defined( KOKKOS_COMPILER_IBM ) + return x ? 
__cnttz8(x) : 64; + #else + int i = 0; + for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {} + return i; + #endif + } + + Bitset() + : m_bits() + { + for (int i=0; i < blocks; ++i) { + m_bits[i] = 0u; + } + } + + bool set( int i ) + { + const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask ); + return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit ); + } + + bool reset( int i ) + { + const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask ); + return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit; + } + + bool test( int i ) + { + const uint64_t block = m_bits[ i >> block_shift ]; + const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask ); + return block & bit; + } + + int find_first_unset() const + { + for (int i=0; i < blocks; ++i) { + const uint64_t block = m_bits[i]; + int b = count_trailing_zeros( ~block ); + + if ( b < 64 ) { + return (i << block_shift) + b; + } + } + return size; + } + + volatile uint64_t m_bits[blocks]; +}; + +//----------------------------------------------------------------------------- +// AllocationRecordPool -- singleton class +// +// global_alloc_rec_pool is the ONLY instance of this class +// +//----------------------------------------------------------------------------- +// Record AllocationRecords in a lock-free circular list. +// Each node in the list has a buffer with space for 959 ((15*64)-1) records +// managed by a bitset. Atomics are used to set and reset bits in the bit set. +// The head of the list is atomically updated to the last node found with +// unused space. +// +// Cost time to create an allocation record: amortized O(1), worst case O(num nodes) +// Cost to destroy an allocation recored: O(1) +// +// Singleton allocations are pushed onto a lock-free stack that is destroyed +// after the circular list of allocation records. 
+struct AllocationRecordPool +{ + enum { BITSET_BLOCKS = 15 }; + + typedef Bitset<BITSET_BLOCKS> bitset_type; + + enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) }; + + struct AllocationNode + { + AllocationNode() + : next() + , bitset() + , buffer() + { + // set the first bit to used + bitset.set(0); + } + + void * get_buffer( int32_t node_index ) + { + return buffer + (node_index-1) * sizeof(AllocationRecord); + } + + // return 0 if no space is available in the node + int32_t get_node_index() + { + int32_t node_index = 0; + do { + node_index = bitset.find_first_unset(); + + // successfully claimed a bit + if ( node_index != bitset.size && bitset.set(node_index) ) + { + return node_index; + } + } while ( node_index != bitset.size ); + return 0; + } + + void clear_node_index( int32_t node_index ) + { + bitset.reset(node_index); + } + + AllocationNode * next; + bitset_type bitset; + char buffer[BUFFER_SIZE]; + }; + + struct SingletonNode + { + void * buffer; + SingletonNode * next; + Impl::singleton_destroy_function_type destroy; + + SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func ) + : buffer(NULL) + , next(NULL) + , destroy(destroy_func) + { + if (size) { + buffer = malloc(size); + create_func(buffer); + } + } + + ~SingletonNode() + { + if (buffer) { + try { + destroy(buffer); + } catch(...) 
{} + free(buffer); + } + } + }; + + AllocationRecordPool() + : head( new AllocationNode() ) + , singleton_head(NULL) + { + // setup ring + head->next = head; + } + + ~AllocationRecordPool() + { + // delete allocation records + { + AllocationNode * start = head; + + AllocationNode * curr = start; + + std::vector< std::string > string_vec; + + do { + AllocationNode * next = curr->next; + + #if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET ) + // print node bitset + for (int i=0; i < bitset_type::blocks; ++i ) { + std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << " "; + } + std::cout << std::endl; + #endif + + // bit zero does not map to an AllocationRecord + for ( int32_t i=1; i < bitset_type::size; ++i ) + { + if (curr->bitset.test(i)) { + AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) ); + + std::ostringstream oss; + alloc_rec->print( oss ); + string_vec.push_back( oss.str() ); + +#if CLEAN_UP_MEMORY_LEAKS +/* Cleaning up memory leaks prevents memory error detection tools + * from reporting the original source of allocation, which can + * impede debugging with such tools. + */ + try { + destroy(alloc_rec); + } + catch(...) 
{} +#endif + } + } + + curr->next = NULL; + + delete curr; + + curr = next; + } while ( curr != start ); + + if ( !string_vec.empty() ) { + std::sort( string_vec.begin(), string_vec.end() ); + + std::ostringstream oss; + oss << "Error: Allocation pool destroyed with the following memory leak(s):\n"; + for (size_t i=0; i< string_vec.size(); ++i) + { + oss << " " << string_vec[i] << std::endl; + } + + std::cerr << oss.str() << std::endl; + } + } + + // delete singletons + { + SingletonNode * curr = singleton_head; + + while (curr) { + SingletonNode * next = curr->next; + delete curr; + curr = next; + } + } + } + + AllocationRecord * create( AllocatorBase * arg_allocator + , void * arg_alloc_ptr + , size_t arg_alloc_size + , const std::string & arg_label + ) + { + AllocationNode * start = volatile_load(&head); + + AllocationNode * curr = start; + + + int32_t node_index = curr->get_node_index(); + + if (node_index == 0) { + curr = volatile_load(&curr->next); + } + + while (node_index == 0 && curr != start) + { + node_index = curr->get_node_index(); + if (node_index == 0) { + curr = volatile_load(&curr->next); + } + } + + // Need to allocate and insert a new node + if (node_index == 0 && curr == start) + { + AllocationNode * new_node = new AllocationNode(); + + node_index = new_node->get_node_index(); + + AllocationNode * next = NULL; + do { + next = volatile_load(&curr->next); + new_node->next = next; + memory_fence(); + } while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) ); + + curr = new_node; + } + + void * buffer = curr->get_buffer(node_index); + + // try to set head to curr + if ( start != curr ) + { + atomic_compare_exchange( & head, start, curr ); + } + + return new (buffer) AllocationRecord( arg_allocator + , arg_alloc_ptr + , arg_alloc_size + , node_index + , arg_label + ); + } + + void destroy( AllocationRecord * alloc_rec ) + { + if (alloc_rec) { + const int32_t node_index = alloc_rec->node_index; + AllocationNode * node = get_node( 
alloc_rec ); + + // deallocate memory + alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size ); + + // call destructor + alloc_rec->~AllocationRecord(); + + // wait for writes to complete + memory_fence(); + + // clear node index + node->clear_node_index( node_index ); + } + } + + void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func ) + { + SingletonNode * node = new SingletonNode( size, create_func, destroy_func ); + SingletonNode * next; + + // insert new node at the head of the list + do { + next = volatile_load(&singleton_head); + node->next = next; + } while ( next != atomic_compare_exchange( &singleton_head, next, node ) ); + + return node->buffer; + } + + void print_memory( std::ostream & out ) const + { + AllocationNode * start = head; + + AllocationNode * curr = start; + + std::vector< std::string > string_vec; + + do { + AllocationNode * next = curr->next; + + // bit zero does not map to an AllocationRecord + for ( int32_t i=1; i < bitset_type::size; ++i ) + { + if (curr->bitset.test(i)) { + AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) ); + + std::ostringstream oss; + alloc_rec->print( oss ); + string_vec.push_back( oss.str() ); + } + } + curr = next; + } while ( curr != start ); + + if ( !string_vec.empty() ) { + std::sort( string_vec.begin(), string_vec.end() ); + + std::ostringstream oss; + oss << "Tracked Memory:" << std::endl; + for (size_t i=0; i< string_vec.size(); ++i) + { + oss << " " << string_vec[i] << std::endl; + } + out << oss.str() << std::endl; + } + else { + out << "No Tracked Memory" << std::endl; + } + } + + // find an AllocationRecord such that + // alloc_ptr <= ptr < alloc_ptr + alloc_size + // otherwise return NULL + AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const + { + AllocationNode * start = head; + + AllocationNode * curr = start; + + char 
const * const char_ptr = reinterpret_cast<const char *>(ptr); + + do { + AllocationNode * next = curr->next; + + // bit zero does not map to an AllocationRecord + for ( int32_t i=1; i < bitset_type::size; ++i ) + { + if (curr->bitset.test(i)) { + AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) ); + + char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr); + + if ( (allocator == alloc_rec->allocator) + && (alloc_ptr <= char_ptr) + && (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) ) + { + return alloc_rec; + } + } + } + curr = next; + } while ( curr != start ); + + return NULL; + } + +private: + + AllocationNode * get_node( AllocationRecord * alloc_rec ) + { + return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index); + } + + AllocationNode * head; + SingletonNode * singleton_head; +}; + +// create the global pool for allocation records +AllocationRecordPool global_alloc_rec_pool; + + + +// convert a uintptr_t to an AllocationRecord pointer +inline +AllocationRecord * to_alloc_rec( uintptr_t alloc_rec ) +{ + return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) ); +} + +} // unnamed namespace + +//----------------------------------------------------------------------------- +// Allocation Tracker methods +//----------------------------------------------------------------------------- + +// Create a reference counted AllocationTracker +void AllocationTracker::initalize( AllocatorBase * arg_allocator + , void * arg_alloc_ptr + , size_t arg_alloc_size + , const std::string & arg_label + ) +{ + if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) { + // create record + AllocationRecord * alloc_rec = global_alloc_rec_pool.create( arg_allocator + , arg_alloc_ptr + , arg_alloc_size + , arg_label + ); + + m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT; + } +} + +void AllocationTracker::reallocate( size_t size ) const +{ + 
AllocationRecord * rec = to_alloc_rec( m_alloc_rec ); + + void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size ); + + if ( NULL != the_alloc_ptr ) + { + *const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr; + *const_cast<uint64_t *>(&rec->alloc_size) = size; + } + else { + Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker"); + } +} + + +void AllocationTracker::increment_ref_count() const +{ + to_alloc_rec( m_alloc_rec )->increment_ref_count(); +} + + +void AllocationTracker::decrement_ref_count() const +{ + AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec ); + uint32_t the_ref_count = alloc_rec->decrement_ref_count(); + if (the_ref_count == 0u) { + try { + global_alloc_rec_pool.destroy( alloc_rec ); + } + catch(...) {} + } +} + +namespace { + +struct NullAllocator { static const char * name() { return "Null Allocator"; } }; + +} + +AllocatorBase * AllocationTracker::allocator() const +{ + if (m_alloc_rec & REF_COUNT_MASK) { + return to_alloc_rec(m_alloc_rec)->allocator; + } + return Allocator<NullAllocator>::singleton(); +} + +void * AllocationTracker::alloc_ptr() const +{ + if (m_alloc_rec & REF_COUNT_MASK) { + return to_alloc_rec(m_alloc_rec)->alloc_ptr; + } + return NULL; +} + +size_t AllocationTracker::alloc_size() const +{ + if (m_alloc_rec & REF_COUNT_MASK) { + return to_alloc_rec(m_alloc_rec)->alloc_size; + } + return 0u; +} + +size_t AllocationTracker::ref_count() const +{ + if (m_alloc_rec & REF_COUNT_MASK) { + return to_alloc_rec(m_alloc_rec)->ref_count; + } + return 0u; +} + +char const * AllocationTracker::label() const +{ + if (m_alloc_rec & REF_COUNT_MASK) { + return to_alloc_rec(m_alloc_rec)->label; + } + return "[Empty Allocation Tracker]"; +} + +void AllocationTracker::print( std::ostream & oss) const +{ + if (m_alloc_rec & REF_COUNT_MASK) { + to_alloc_rec(m_alloc_rec)->print(oss); + } + else { + oss << label(); + } +} + +bool AllocationTracker::set_attribute( 
AllocatorAttributeBase * attr ) const +{ + bool result = false; + if (m_alloc_rec & REF_COUNT_MASK) { + result = to_alloc_rec(m_alloc_rec)->set_attribute(attr); + } + return result; +} + +AllocatorAttributeBase * AllocationTracker::attribute() const +{ + if (m_alloc_rec & REF_COUNT_MASK) { + return to_alloc_rec(m_alloc_rec)->attribute; + } + return NULL; +} + +void AllocationTracker::print_tracked_memory( std::ostream & out ) +{ + global_alloc_rec_pool.print_memory( out ); +} + + +AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator ) +{ + AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator); + + AllocationTracker tracker; + + if ( alloc_rec != NULL ) + { + if ( tracking_enabled() ) { + alloc_rec->increment_ref_count(); + tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT; + } + else { + tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec); + } + } + + return tracker ; +} + + + +//----------------------------------------------------------------------------- +// static AllocationTracker +//----------------------------------------------------------------------------- +#if defined( KOKKOS_USE_DECENTRALIZED_HOST ) +namespace { + + // TODO : Detect compiler support for thread local variables + #if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) + bool g_thread_local_tracking_enabled = true; + #pragma omp threadprivate(g_thread_local_tracking_enabled) + #elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) + __thread bool g_thread_local_tracking_enabled = true; + #elif defined( KOKKOS_HAVE_OPENMP ) + bool g_thread_local_tracking_enabled = true; + #pragma omp threadprivate(g_thread_local_tracking_enabled) + #elif defined( KOKKOS_HAVE_PTHREAD ) + __thread bool g_thread_local_tracking_enabled = true; + #elif defined( KOKKOS_HAVE_SERIAL ) + bool g_thread_local_tracking_enabled = true; + #endif +} // unnamed namespace + +void AllocationTracker::disable_tracking() +{ + 
g_thread_local_tracking_enabled = false; +} + +void AllocationTracker::enable_tracking() +{ + g_thread_local_tracking_enabled = true; +} + +bool AllocationTracker::tracking_enabled() +{ + return g_thread_local_tracking_enabled; +} +#else +namespace { +enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED }; +volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED; +} + +void AllocationTracker::disable_tracking() +{ + if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) { + Impl::throw_runtime_exception("Error: Tracking already disabled"); + } +} + +void AllocationTracker::enable_tracking() +{ + if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) { + Impl::throw_runtime_exception("Error: Tracking already enabled"); + } +} + +bool AllocationTracker::tracking_enabled() +{ + return g_tracking_enabled == TRACKING_ENABLED; +} +#endif + + +//----------------------------------------------------------------------------- +// create singleton free function +//----------------------------------------------------------------------------- +void * create_singleton( size_t size + , Impl::singleton_create_function_type create_func + , Impl::singleton_destroy_function_type destroy_func ) +{ + return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func ); +} + +}} // namespace Kokkos::Impl + +#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp new file mode 100755 index 0000000000000000000000000000000000000000..331c4e8facb1e0951082cd9a715a019ee3f0c5cd --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp @@ -0,0 +1,586 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ALLOCATION_TRACKER_HPP +#define KOKKOS_ALLOCATION_TRACKER_HPP + +#include <Kokkos_Macros.hpp> + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Error.hpp> + +#include <stdint.h> +#include <cstdlib> +#include <string> +#include <iosfwd> + +namespace Kokkos { namespace Impl { + +//----------------------------------------------------------------------------- +// Create Singleton objects +//----------------------------------------------------------------------------- + +typedef void * (*singleton_create_function_type)(void * buffer); +typedef void (*singleton_destroy_function_type)(void *); + +void * create_singleton( size_t size + , singleton_create_function_type create_func + , singleton_destroy_function_type destroy_func + ); + + + +/// class Singleton +/// +/// Default construct a singleton type. This method is used to circumvent +/// order of construction issues. Singleton objects are destroyed after all +/// other allocations in the reverse order of their creation. +template <typename Type> +class Singleton +{ +public: + /// Get a pointer to the Singleton. 
Default construct the singleton if it does not already exist + static Type * get() + { + static Type * singleton = NULL; + if (singleton == NULL) { + Impl::singleton_create_function_type create_func = &create; + Impl::singleton_destroy_function_type destroy_func = &destroy; + singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) ); + } + return singleton; + } + +private: + + /// Call the Type destructor + static void destroy(void * ptr) + { + reinterpret_cast<Type*>(ptr)->~Type(); + } + + /// placement new the Type in buffer + static void * create(void * buffer) + { + return new (buffer) Type(); + } +}; + + +//----------------------------------------------------------------------------- +// AllocatorBase +//----------------------------------------------------------------------------- + +/// class AllocatorBase +/// +/// Abstract base class for all Allocators. +/// Allocators should be singleton objects, use Singleton<Allocator>::get to create +/// to avoid order of destruction issues +class AllocatorBase +{ +public: + /// name of the allocator + /// used to report memory leaks + virtual const char * name() const = 0; + + /// Allocate a buffer of size number of bytes + virtual void* allocate(size_t size) const = 0; + + /// Deallocate a buffer with size number of bytes + /// The pointer must have been allocated with a call to corresponding allocate + virtual void deallocate(void * ptr, size_t size) const = 0; + + /// Changes the size of the memory block pointed to by ptr. + /// Ptr must have been allocated with the corresponding allocate call + /// The function may move the memory block to a new location + /// (whose address is returned by the function). + /// + /// The content of the memory block is preserved up to the lesser of the new and + /// old sizes, even if the block is moved to a new location. If the new size is larger, + /// the value of the newly allocated portion is indeterminate. 
+ /// + /// In case that ptr is a null pointer, the function behaves like allocate, assigning a + /// new block of size bytes and returning a pointer to its beginning. + virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0; + + /// can a texture object be bound to the allocated memory + virtual bool support_texture_binding() const = 0; + + /// virtual destructor + virtual ~AllocatorBase() {} +}; + +/// class AllocatorAttributeBase +class AllocatorAttributeBase +{ +public: + virtual ~AllocatorAttributeBase() {} +}; + +//----------------------------------------------------------------------------- +// Allocator< StaticAllocator > : public AllocatorBase +//----------------------------------------------------------------------------- + +// HasStaticName +template<typename T> +class HasStaticName +{ + typedef const char * (*static_method)(); + template<typename U, static_method> struct SFINAE {}; + template<typename U> static char Test(SFINAE<U, &U::name>*); + template<typename U> static int Test(...); +public: + enum { value = sizeof(Test<T>(0)) == sizeof(char) }; +}; + + +template <typename T> +inline +typename enable_if<HasStaticName<T>::value, const char *>::type +allocator_name() +{ + return T::name(); +} + +template <typename T> +inline +typename enable_if<!HasStaticName<T>::value, const char *>::type +allocator_name() +{ + return "Unnamed Allocator"; +} + + +// HasStaticAllocate +template<typename T> +class HasStaticAllocate +{ + typedef void * (*static_method)(size_t); + template<typename U, static_method> struct SFINAE {}; + template<typename U> static char Test(SFINAE<U, &U::allocate>*); + template<typename U> static int Test(...); +public: + enum { value = sizeof(Test<T>(0)) == sizeof(char) }; +}; + +template <typename T> +inline +typename enable_if<HasStaticAllocate<T>::value, void *>::type +allocator_allocate(size_t size) +{ + return T::allocate(size); +} + +template <typename T> +inline +typename 
enable_if<!HasStaticAllocate<T>::value, void *>::type +allocator_allocate(size_t) +{ + throw_runtime_exception( std::string("Error: ") + + std::string(allocator_name<T>()) + + std::string(" cannot allocate memory!") ); + return NULL; +} + +// HasStaticDeallocate +template<typename T> +class HasStaticDeallocate +{ + typedef void (*static_method)(void *, size_t); + template<typename U, static_method> struct SFINAE {}; + template<typename U> static char Test(SFINAE<U, &U::deallocate>*); + template<typename U> static int Test(...); +public: + enum { value = sizeof(Test<T>(0)) == sizeof(char) }; +}; + +template <typename T> +inline +typename enable_if<HasStaticDeallocate<T>::value, void>::type +allocator_deallocate(void * ptr, size_t size) +{ + T::deallocate(ptr,size); +} + +template <typename T> +inline +typename enable_if<!HasStaticDeallocate<T>::value, void>::type +allocator_deallocate(void *, size_t) +{ + throw_runtime_exception( std::string("Error: ") + + std::string(allocator_name<T>()) + + std::string(" cannot deallocate memory!") ); +} + +// HasStaticReallocate +template<typename T> +class HasStaticReallocate +{ + typedef void * (*static_method)(void *, size_t, size_t); + template<typename U, static_method> struct SFINAE {}; + template<typename U> static char Test(SFINAE<U, &U::reallocate>*); + template<typename U> static int Test(...); +public: + enum { value = sizeof(Test<T>(0)) == sizeof(char) }; +}; + +template <typename T> +inline +typename enable_if<HasStaticReallocate<T>::value, void *>::type +allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size) +{ + return T::reallocate(old_ptr, old_size, new_size); +} + +template <typename T> +inline +typename enable_if<!HasStaticReallocate<T>::value, void *>::type +allocator_reallocate(void *, size_t, size_t) +{ + throw_runtime_exception( std::string("Error: ") + + std::string(allocator_name<T>()) + + std::string(" cannot reallocate memory!") ); + return NULL; +} + +// HasStaticReallocate 
+template<typename T> +class HasStaticSupportTextureBinding +{ + typedef bool (*static_method)(); + template<typename U, static_method> struct SFINAE {}; + template<typename U> static char Test(SFINAE<U, &U::support_texture_binding>*); + template<typename U> static int Test(...); +public: + enum { value = sizeof(Test<T>(0)) == sizeof(char) }; +}; + +template <typename T> +inline +typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type +allocator_support_texture_binding() +{ + return T::support_texture_binding(); +} + +template <typename T> +inline +typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type +allocator_support_texture_binding() +{ + return false; +} + +template <typename T> +class Allocator : public AllocatorBase +{ +public: + virtual const char * name() const + { + return allocator_name<T>(); + } + + virtual void* allocate(size_t size) const + { + return allocator_allocate<T>(size); + } + + virtual void deallocate(void * ptr, size_t size) const + { + allocator_deallocate<T>(ptr,size); + } + + virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const + { + return allocator_reallocate<T>(old_ptr, old_size, new_size); + } + + virtual bool support_texture_binding() const + { + return allocator_support_texture_binding<T>(); + } + + static AllocatorBase * singleton() + { + return Singleton< Allocator<T> >::get(); + } +}; + +//----------------------------------------------------------------------------- +// AllocationTracker +//----------------------------------------------------------------------------- + +// forward declaration for friend classes +struct CopyWithoutTracking; +struct MallocHelper; + +/// class AllocationTracker +/// Will call deallocate from the AllocatorBase when the reference count reaches 0. +/// Reference counting is disabled when the host is in parallel. 
+class AllocationTracker +{ + // use the least significant bit of the AllocationRecord pointer to indicate if the + // AllocationTracker should reference count + enum { + REF_COUNT_BIT = static_cast<uintptr_t>(1) + , REF_COUNT_MASK = ~static_cast<uintptr_t>(1) + }; + +public: + + /// Find an AllocationTracker such that + /// alloc_ptr <= ptr < alloc_ptr + alloc_size + /// O(n) where n is the number of tracked allocations. + template <typename StaticAllocator> + static AllocationTracker find( void const * ptr ) + { + return find( ptr, Allocator<StaticAllocator>::singleton() ); + } + + + /// Pretty print all the currently tracked memory + static void print_tracked_memory( std::ostream & out ); + + /// Default constructor + KOKKOS_INLINE_FUNCTION + AllocationTracker() + : m_alloc_rec(0) + {} + + /// Create an AllocationTracker + /// + /// Start reference counting the alloc_ptr. + /// When the reference count reaches 0 the allocator deallocate method + /// will be called with the given size. The alloc_ptr should have been + /// allocated with the allocator's allocate method. + /// + /// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0 + /// do nothing + template <typename StaticAllocator> + AllocationTracker( StaticAllocator const & + , void * arg_alloc_ptr + , size_t arg_alloc_size + , const std::string & arg_label = std::string("") ) + : m_alloc_rec(0) + { + AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton(); + initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label); + } + + /// Create an AllocationTracker + /// + /// Start reference counting the alloc_ptr. + /// When the reference count reaches 0 the allocator deallocate method + /// will be called with the given size. The alloc_ptr should have been + /// allocated with the allocator's allocate method. 
+ /// + /// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0 + /// do nothing + template <typename StaticAllocator> + AllocationTracker( StaticAllocator const & + , size_t arg_alloc_size + , const std::string & arg_label = std::string("") + ) + : m_alloc_rec(0) + { + AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton(); + void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size ); + + initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label); + } + + /// Copy an AllocatorTracker + KOKKOS_INLINE_FUNCTION + AllocationTracker( const AllocationTracker & rhs ) + : m_alloc_rec( rhs.m_alloc_rec) + { +#if !defined( __CUDA_ARCH__ ) + if ( rhs.ref_counting() && tracking_enabled() ) { + increment_ref_count(); + } + else { + m_alloc_rec = m_alloc_rec & REF_COUNT_MASK; + } +#else + m_alloc_rec = m_alloc_rec & REF_COUNT_MASK; +#endif + } + + /// Copy an AllocatorTracker + /// Decrement the reference count of the current tracker if necessary + KOKKOS_INLINE_FUNCTION + AllocationTracker & operator=( const AllocationTracker & rhs ) + { + if (this != &rhs) { +#if !defined( __CUDA_ARCH__ ) + if ( ref_counting() ) { + decrement_ref_count(); + } + + m_alloc_rec = rhs.m_alloc_rec; + + if ( rhs.ref_counting() && tracking_enabled() ) { + increment_ref_count(); + } + else { + m_alloc_rec = m_alloc_rec & REF_COUNT_MASK; + } +#else + m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK; +#endif + } + + return * this; + } + + /// Destructor + /// Decrement the reference count if necessary + KOKKOS_INLINE_FUNCTION + ~AllocationTracker() + { +#if !defined( __CUDA_ARCH__ ) + if ( ref_counting() ) { + decrement_ref_count(); + } +#endif + } + + /// Is the tracker valid? 
+ KOKKOS_INLINE_FUNCTION + bool is_valid() const + { + return (m_alloc_rec & REF_COUNT_MASK); + } + + + + /// clear the tracker + KOKKOS_INLINE_FUNCTION + void clear() + { +#if !defined( __CUDA_ARCH__ ) + if ( ref_counting() ) { + decrement_ref_count(); + } +#endif + m_alloc_rec = 0; + } + + /// is this tracker currently counting allocations? + KOKKOS_INLINE_FUNCTION + bool ref_counting() const + { + return (m_alloc_rec & REF_COUNT_BIT); + } + + AllocatorBase * allocator() const; + + /// pointer to the allocated memory + void * alloc_ptr() const; + + /// size in bytes of the allocated memory + size_t alloc_size() const; + + /// the current reference count + size_t ref_count() const; + + /// the label given to the allocation + char const * label() const; + + /// pretty print all the tracker's information to the std::ostream + void print( std::ostream & oss) const; + + + /// set an attribute ptr on the allocation record + /// the arg_attribute pointer will be deleted when the record is destroyed + /// the attribute ptr can only be set once + bool set_attribute( AllocatorAttributeBase * arg_attribute) const; + + /// get the attribute ptr from the allocation record + AllocatorAttributeBase * attribute() const; + + + /// reallocate the memory tracked by this allocation + /// NOT thread-safe + void reallocate( size_t size ) const; + +private: + + static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator ); + + void initalize( AllocatorBase * arg_allocator + , void * arg_alloc_ptr + , size_t arg_alloc_size + , std::string const & label ); + + void increment_ref_count() const; + void decrement_ref_count() const; + + static void disable_tracking(); + static void enable_tracking(); + static bool tracking_enabled(); + + friend struct Impl::CopyWithoutTracking; + friend struct Impl::MallocHelper; + + uintptr_t m_alloc_rec; +}; + + + +/// Make a copy of the functor with reference counting disabled +struct CopyWithoutTracking +{ + template <typename 
Functor> + static Functor apply( const Functor & f ) + { + AllocationTracker::disable_tracking(); + Functor func(f); + AllocationTracker::enable_tracking(); + return func; + } +}; + +}} // namespace Kokkos::Impl + +#endif //KOKKOS_ALLOCATION_TRACKER_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp new file mode 100755 index 0000000000000000000000000000000000000000..2de9df008ee5b42b5d38727ead56bae768869c43 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp @@ -0,0 +1,260 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ANALYZESHAPE_HPP +#define KOKKOS_ANALYZESHAPE_HPP + +#include <impl/Kokkos_Shape.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +/** \brief Analyze the array shape defined by a Kokkos::View data type. + * + * It is presumed that the data type can be mapped down to a multidimensional + * array of an intrinsic scalar numerical type (double, float, int, ... ). + * The 'value_type' of an array may be an embedded aggregate type such + * as a fixed length array 'Array<T,N>'. + * In this case the 'array_intrinsic_type' represents the + * underlying array of intrinsic scalar numerical type. + * + * The embedded aggregate type must have an AnalyzeShape specialization + * to map it down to a shape and intrinsic scalar numerical type. 
+ */ +template< class T > +struct AnalyzeShape : public Shape< sizeof(T) , 0 > +{ + typedef void specialize ; + + typedef Shape< sizeof(T), 0 > shape ; + + typedef T array_intrinsic_type ; + typedef T value_type ; + typedef T type ; + + typedef const T const_array_intrinsic_type ; + typedef const T const_value_type ; + typedef const T const_type ; + + typedef T non_const_array_intrinsic_type ; + typedef T non_const_value_type ; + typedef T non_const_type ; +}; + +template<> +struct AnalyzeShape<void> : public Shape< 0 , 0 > +{ + typedef void specialize ; + + typedef Shape< 0 , 0 > shape ; + + typedef void array_intrinsic_type ; + typedef void value_type ; + typedef void type ; + typedef const void const_array_intrinsic_type ; + typedef const void const_value_type ; + typedef const void const_type ; + typedef void non_const_array_intrinsic_type ; + typedef void non_const_value_type ; + typedef void non_const_type ; +}; + +template< class T > +struct AnalyzeShape< const T > : public AnalyzeShape<T>::shape +{ +private: + typedef AnalyzeShape<T> nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename nested::shape shape ; + + typedef typename nested::const_array_intrinsic_type array_intrinsic_type ; + typedef typename nested::const_value_type value_type ; + typedef typename nested::const_type type ; + + typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type const_type ; + + typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type non_const_type ; +}; + +template< class T > +struct AnalyzeShape< T * > + : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type +{ +private: + typedef AnalyzeShape<T> nested ; +public: + + typedef typename nested::specialize 
specialize ; + + typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; + + typedef typename nested::array_intrinsic_type * array_intrinsic_type ; + typedef typename nested::value_type value_type ; + typedef typename nested::type * type ; + + typedef typename nested::const_array_intrinsic_type * const_array_intrinsic_type ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type * const_type ; + + typedef typename nested::non_const_array_intrinsic_type * non_const_array_intrinsic_type ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type * non_const_type ; +}; + +template< class T > +struct AnalyzeShape< T[] > + : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type +{ +private: + typedef AnalyzeShape<T> nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; + + typedef typename nested::array_intrinsic_type array_intrinsic_type [] ; + typedef typename nested::value_type value_type ; + typedef typename nested::type type [] ; + + typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [] ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type const_type [] ; + + typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [] ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type non_const_type [] ; +}; + +template< class T > +struct AnalyzeShape< const T[] > + : public ShapeInsert< typename AnalyzeShape< const T >::shape , 0 >::type +{ +private: + typedef AnalyzeShape< const T > nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; + + typedef typename nested::array_intrinsic_type 
array_intrinsic_type [] ; + typedef typename nested::value_type value_type ; + typedef typename nested::type type [] ; + + typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [] ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type const_type [] ; + + typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [] ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type non_const_type [] ; +}; + +template< class T , unsigned N > +struct AnalyzeShape< T[N] > + : public ShapeInsert< typename AnalyzeShape<T>::shape , N >::type +{ +private: + typedef AnalyzeShape<T> nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename ShapeInsert< typename nested::shape , N >::type shape ; + + typedef typename nested::array_intrinsic_type array_intrinsic_type [N] ; + typedef typename nested::value_type value_type ; + typedef typename nested::type type [N] ; + + typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [N] ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type const_type [N] ; + + typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [N] ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type non_const_type [N] ; +}; + +template< class T , unsigned N > +struct AnalyzeShape< const T[N] > + : public ShapeInsert< typename AnalyzeShape< const T >::shape , N >::type +{ +private: + typedef AnalyzeShape< const T > nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename ShapeInsert< typename nested::shape , N >::type shape ; + + typedef typename nested::array_intrinsic_type array_intrinsic_type [N] ; + typedef typename nested::value_type value_type ; + typedef typename nested::type type 
[N] ; + + typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [N] ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type const_type [N] ; + + typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [N] ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type non_const_type [N] ; +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_ANALYZESHAPE_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly_X86.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly_X86.hpp new file mode 100755 index 0000000000000000000000000000000000000000..e9c7a16d585060bcc76e6bb133010bf45b4ea2d5 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly_X86.hpp @@ -0,0 +1,214 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_X86_HPP ) +#define KOKKOS_ATOMIC_ASSEMBLY_X86_HPP +namespace Kokkos { + +#ifdef KOKKOS_ENABLE_ASM +#ifndef __CUDA_ARCH__ +template<> +KOKKOS_INLINE_FUNCTION +void atomic_increment<char>(volatile char* a) { + __asm__ __volatile__( + "lock incb %0" + : /* no output registers */ + : "m" (a[0]) + : "memory" + ); +} + +template<> +KOKKOS_INLINE_FUNCTION +void atomic_increment<short>(volatile short* a) { + __asm__ __volatile__( + "lock incw %0" + : /* no output registers */ + : "m" (a[0]) + : "memory" + ); +} + +template<> +KOKKOS_INLINE_FUNCTION +void atomic_increment<int>(volatile int* a) { + __asm__ __volatile__( + "lock incl %0" + : /* no output registers */ + : "m" (a[0]) + : "memory" + ); +} + +template<> +KOKKOS_INLINE_FUNCTION +void atomic_increment<long long int>(volatile long long int* a) { + __asm__ __volatile__( + "lock incq %0" + : /* no output registers */ + : "m" (a[0]) + : "memory" + ); +} + +template<> +KOKKOS_INLINE_FUNCTION +void 
atomic_decrement<char>(volatile char* a) { + __asm__ __volatile__( + "lock decb %0" + : /* no output registers */ + : "m" (a[0]) + : "memory" + ); +} + +template<> +KOKKOS_INLINE_FUNCTION +void atomic_decrement<short>(volatile short* a) { + __asm__ __volatile__( + "lock decw %0" + : /* no output registers */ + : "m" (a[0]) + : "memory" + ); +} + +template<> +KOKKOS_INLINE_FUNCTION +void atomic_decrement<int>(volatile int* a) { + __asm__ __volatile__( + "lock decl %0" + : /* no output registers */ + : "m" (a[0]) + : "memory" + ); +} + +template<> +KOKKOS_INLINE_FUNCTION +void atomic_decrement<long long int>(volatile long long int* a) { + __asm__ __volatile__( + "lock decq %0" + : /* no output registers */ + : "m" (a[0]) + : "memory" + ); +} +#endif +#endif + +namespace Impl { + struct cas128_t + { + uint64_t lower; + uint64_t upper; + + KOKKOS_INLINE_FUNCTION + cas128_t () { + lower = 0; + upper = 0; + } + + KOKKOS_INLINE_FUNCTION + cas128_t (const cas128_t& a) { + lower = a.lower; + upper = a.upper; + } + KOKKOS_INLINE_FUNCTION + cas128_t (volatile cas128_t* a) { + lower = a->lower; + upper = a->upper; + } + + KOKKOS_INLINE_FUNCTION + bool operator != (const cas128_t& a) const { + return (lower != a.lower) || upper!=a.upper; + } + + KOKKOS_INLINE_FUNCTION + void operator = (const cas128_t& a) { + lower = a.lower; + upper = a.upper; + } + KOKKOS_INLINE_FUNCTION + void operator = (const cas128_t& a) volatile { + lower = a.lower; + upper = a.upper; + } + } + __attribute__ (( __aligned__( 16 ) )); + + + + + inline cas128_t cas128( volatile cas128_t * ptr, cas128_t cmp, cas128_t swap ) + { + #ifdef KOKKOS_ENABLE_ASM + bool swapped = false; + __asm__ __volatile__ + ( + "lock cmpxchg16b %1\n\t" + "setz %0" + : "=q" ( swapped ) + , "+m" ( *ptr ) + , "+d" ( cmp.upper ) + , "+a" ( cmp.lower ) + : "c" ( swap.upper ) + , "b" ( swap.lower ) + , "q" ( swapped ) + ); + return cmp; + #else + cas128_t tmp(ptr); + if(tmp != cmp) { + return tmp; + } else { + *ptr = swap; + return 
swap; + } + #endif + } + +} +} + +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp new file mode 100755 index 0000000000000000000000000000000000000000..524cd7327d6f657156f45fc80b61564935582b74 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp @@ -0,0 +1,259 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP ) +#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- +// Cuda native CAS supports int, unsigned int, and unsigned long long int (non-standard type). +// Must cast-away 'volatile' for the CAS call. 
+ +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + +__inline__ __device__ +int atomic_compare_exchange( volatile int * const dest, const int compare, const int val) +{ return atomicCAS((int*)dest,compare,val); } + +__inline__ __device__ +unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val) +{ return atomicCAS((unsigned int*)dest,compare,val); } + +__inline__ __device__ +unsigned long long int atomic_compare_exchange( volatile unsigned long long int * const dest , + const unsigned long long int compare , + const unsigned long long int val ) +{ return atomicCAS((unsigned long long int*)dest,compare,val); } + +template < typename T > +__inline__ __device__ +T atomic_compare_exchange( volatile T * const dest , const T & compare , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) +{ + const int tmp = atomicCAS( (int*) dest , *((int*)&compare) , *((int*)&val) ); + return *((T*)&tmp); +} + +template < typename T > +__inline__ __device__ +T atomic_compare_exchange( volatile T * const dest , const T & compare , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int) , const T & >::type val ) +{ + typedef unsigned long long int type ; + const type tmp = atomicCAS( (type*) dest , *((type*)&compare) , *((type*)&val) ); + return *((T*)&tmp); +} + +template < typename T > +__inline__ __device__ +T atomic_compare_exchange( volatile T * const dest , const T & compare , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + , const T >::type& val ) +{ + T return_val; + // This is a way to (hopefully) avoid dead lock in a warp + bool done = false; + while (! 
done ) { + if( Impl::lock_address_cuda_space( (void*) dest ) ) { + return_val = *dest; + if( return_val == compare ) + *dest = val; + Impl::unlock_address_cuda_space( (void*) dest ); done = true; // fix: terminate retry loop once the lock was held, otherwise this spins forever + } + } + return return_val; +} + +//---------------------------------------------------------------------------- +// GCC native CAS supports int, long, unsigned int, unsigned long. +// Intel native CAS support int and long with the same interface as GCC. + +#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) + +KOKKOS_INLINE_FUNCTION +int atomic_compare_exchange( volatile int * const dest, const int compare, const int val) +{ return __sync_val_compare_and_swap(dest,compare,val); } + +KOKKOS_INLINE_FUNCTION +long atomic_compare_exchange( volatile long * const dest, const long compare, const long val ) +{ return __sync_val_compare_and_swap(dest,compare,val); } + +#if defined( KOKKOS_ATOMICS_USE_GCC ) + +// GCC supports unsigned + +KOKKOS_INLINE_FUNCTION +unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val ) +{ return __sync_val_compare_and_swap(dest,compare,val); } + +KOKKOS_INLINE_FUNCTION +unsigned long atomic_compare_exchange( volatile unsigned long * const dest , + const unsigned long compare , + const unsigned long val ) +{ return __sync_val_compare_and_swap(dest,compare,val); } + +#endif + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_compare_exchange( volatile T * const dest, const T & compare, + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) +{ +#ifdef KOKKOS_HAVE_CXX11 + union U { + int i ; + T t ; + KOKKOS_INLINE_FUNCTION U() {}; + } tmp ; +#else + union U { + int i ; + T t ; + } tmp ; +#endif + + tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) ); + return tmp.t ; +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_compare_exchange( volatile T * const dest, const T & compare, + 
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(long) , const T & >::type val ) +{ +#ifdef KOKKOS_HAVE_CXX11 + union U { + long i ; + T t ; + KOKKOS_INLINE_FUNCTION U() {}; + } tmp ; +#else + union U { + long i ; + T t ; + } tmp ; +#endif + + tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) ); + return tmp.t ; +} + +#ifdef KOKKOS_ENABLE_ASM +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_compare_exchange( volatile T * const dest, const T & compare, + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) != sizeof(long) && + sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val ) +{ + union U { + Impl::cas128_t i ; + T t ; + KOKKOS_INLINE_FUNCTION U() {}; + } tmp ; + + tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) ); + return tmp.t ; +} +#endif + +template < typename T > +inline +T atomic_compare_exchange( volatile T * const dest , const T compare , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + #if defined(KOKKOS_ENABLE_ASM) + && ( sizeof(T) != 16 ) + #endif + , const T >::type& val ) +{ + while( !Impl::lock_address_host_space( (void*) dest ) ); + T return_val = *dest; + if( return_val == compare ) { + const T tmp = *dest = val; + #ifndef KOKKOS_COMPILER_CLANG + (void) tmp; + #endif + } + Impl::unlock_address_host_space( (void*) dest ); + return return_val; +} +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template< typename T > +KOKKOS_INLINE_FUNCTION +T atomic_compare_exchange( volatile T * const dest, const T compare, const T val ) +{ + T retval; +#pragma omp critical + { + retval = dest[0]; + if ( retval == compare ) + dest[0] = val; + } + return retval; +} + +#endif + +template <typename T> +KOKKOS_INLINE_FUNCTION +bool atomic_compare_exchange_strong(volatile T* const dest, 
const T compare, const T val) +{ + return compare == atomic_compare_exchange(dest, compare, val); +} + +//---------------------------------------------------------------------------- + +} // namespace Kokkos + +#endif + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp new file mode 100755 index 0000000000000000000000000000000000000000..1bdbdbc7f904e7ef284d818015b9c059033ca2a6 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp @@ -0,0 +1,340 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP ) +#define KOKKOS_ATOMIC_EXCHANGE_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + +__inline__ __device__ +int atomic_exchange( volatile int * const dest , const int val ) +{ + // return __iAtomicExch( (int*) dest , val ); + return atomicExch( (int*) dest , val ); +} + +__inline__ __device__ +unsigned int atomic_exchange( volatile unsigned int * const dest , const unsigned int val ) +{ + // return __uAtomicExch( (unsigned int*) dest , val ); + return atomicExch( (unsigned int*) dest , val ); +} + +__inline__ __device__ +unsigned long long int atomic_exchange( volatile unsigned long long int * const dest , const unsigned long long int val ) +{ + // return __ullAtomicExch( (unsigned long long*) dest , val ); + return atomicExch( (unsigned long long*) dest , val ); +} + +/** \brief Atomic exchange for any type with compatible size */ +template< typename T > +__inline__ __device__ +T atomic_exchange( + volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) +{ + // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) ); + int tmp = 
atomicExch( ((int*)dest) , *((int*)&val) ); + return *((T*)&tmp); +} + +template< typename T > +__inline__ __device__ +T atomic_exchange( + volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int) , const T & >::type val ) +{ + typedef unsigned long long int type ; + // type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) ); + type tmp = atomicExch( ((type*)dest) , *((type*)&val) ); + return *((T*)&tmp); +} + +template < typename T > +__inline__ __device__ +T atomic_exchange( volatile T * const dest , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + , const T >::type& val ) +{ + T return_val; + // This is a way to (hopefully) avoid dead lock in a warp + bool done = false; + while (! done ) { + if( Impl::lock_address_cuda_space( (void*) dest ) ) { + return_val = *dest; + *dest = val; + Impl::unlock_address_cuda_space( (void*) dest ); done = true; // fix: terminate retry loop once the lock was held, otherwise this spins forever + } + } + return return_val; +} +/** \brief Atomic exchange for any type with compatible size */ +template< typename T > +__inline__ __device__ +void atomic_assign( + volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) +{ + // (void) __ullAtomicExch( (int*) dest , *((int*)&val) ); + (void) atomicExch( ((int*)dest) , *((int*)&val) ); +} + +template< typename T > +__inline__ __device__ +void atomic_assign( + volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int) , const T & >::type val ) +{ + typedef unsigned long long int type ; + // (void) __ullAtomicExch( (type*) dest , *((type*)&val) ); + (void) atomicExch( ((type*)dest) , *((type*)&val) ); +} + +template< typename T > +__inline__ __device__ +void atomic_assign( + volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) != sizeof(unsigned long long int) + , const T & >::type val ) +{ 
+ (void) atomic_exchange(dest,val); +} + +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) + +template< typename T > +KOKKOS_INLINE_FUNCTION +T atomic_exchange( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long) + , const T & >::type val ) +{ + typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ; + + const type v = *((type*)&val); // Extract to be sure the value doesn't change + + type assumed ; + +#ifdef KOKKOS_HAVE_CXX11 + union U { + T val_T ; + type val_type ; + KOKKOS_INLINE_FUNCTION U() {}; + } old ; +#else + union { T val_T ; type val_type ; } old ; +#endif + + old.val_T = *dest ; + + do { + assumed = old.val_type ; + old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v ); + } while ( assumed != old.val_type ); + + return old.val_T ; +} + +#if defined(KOKKOS_ENABLE_ASM) +template< typename T > +KOKKOS_INLINE_FUNCTION +T atomic_exchange( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t) + , const T & >::type val ) +{ + union U { + Impl::cas128_t i ; + T t ; + KOKKOS_INLINE_FUNCTION U() {}; + } assume , oldval , newval ; + + oldval.t = *dest ; + newval.t = val; + + do { + assume.i = oldval.i ; + oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} +#endif + +//---------------------------------------------------------------------------- + +template < typename T > +inline +T atomic_exchange( volatile T * const dest , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + #if defined(KOKKOS_ENABLE_ASM) + && ( sizeof(T) != 16 ) + #endif + , const T >::type& val ) +{ + while( !Impl::lock_address_host_space( (void*) dest ) ); + T return_val = *dest; + const T tmp = 
*dest = val; + #ifndef KOKKOS_COMPILER_CLANG + (void) tmp; + #endif + Impl::unlock_address_host_space( (void*) dest ); + return return_val; +} + +template< typename T > +KOKKOS_INLINE_FUNCTION +void atomic_assign( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long) + , const T & >::type val ) +{ + typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ; + + const type v = *((type*)&val); // Extract to be sure the value doesn't change + + type assumed ; + +#ifdef KOKKOS_HAVE_CXX11 + union U { + T val_T ; + type val_type ; + KOKKOS_INLINE_FUNCTION U() {}; + } old ; +#else + union { T val_T ; type val_type ; } old ; +#endif + + old.val_T = *dest ; + + do { + assumed = old.val_type ; + old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v ); + } while ( assumed != old.val_type ); +} + +#ifdef KOKKOS_ENABLE_ASM +template< typename T > +KOKKOS_INLINE_FUNCTION +void atomic_assign( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t) + , const T & >::type val ) +{ + union U { + Impl::cas128_t i ; + T t ; + KOKKOS_INLINE_FUNCTION U() {}; + } assume , oldval , newval ; + + oldval.t = *dest ; + newval.t = val; + do { + assume.i = oldval.i ; + oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i); + } while ( assume.i != oldval.i ); +} +#endif + +template < typename T > +inline +void atomic_assign( volatile T * const dest , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + #if defined(KOKKOS_ENABLE_ASM) + && ( sizeof(T) != 16 ) + #endif + , const T >::type& val ) +{ + while( !Impl::lock_address_host_space( (void*) dest ) ); + *dest = val; + Impl::unlock_address_host_space( (void*) dest ); +} +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template < typename T > 
+KOKKOS_INLINE_FUNCTION +T atomic_exchange( volatile T * const dest , const T val ) +{ + T retval; +//#pragma omp atomic capture + #pragma omp critical + { + retval = dest[0]; + dest[0] = val; + } + return retval; +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +void atomic_assign( volatile T * const dest , const T val ) +{ +//#pragma omp atomic + #pragma omp critical + { + dest[0] = val; + } +} + +#endif + +} // namespace Kokkos + +#endif + +//---------------------------------------------------------------------------- + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp new file mode 100755 index 0000000000000000000000000000000000000000..b06a5b424313d1b9a943de94b38d27f1158d74ca --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp @@ -0,0 +1,326 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP ) +#define KOKKOS_ATOMIC_FETCH_ADD_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + +// Support for int, unsigned int, unsigned long long int, and float + +__inline__ __device__ +int atomic_fetch_add( volatile int * const dest , const int val ) +{ return atomicAdd((int*)dest,val); } + +__inline__ __device__ +unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val ) +{ return atomicAdd((unsigned int*)dest,val); } + +__inline__ __device__ +unsigned long long int atomic_fetch_add( volatile unsigned long long int * const dest , + const unsigned long long int val ) +{ return atomicAdd((unsigned long long int*)dest,val); } + +__inline__ __device__ +float atomic_fetch_add( volatile float * const dest , const float val ) +{ return atomicAdd((float*)dest,val); } + +template < typename T > +__inline__ __device__ +T atomic_fetch_add( 
volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val ) +{ +#ifdef KOKKOS_HAVE_CXX11 + union U { + int i ; + T t ; + KOKKOS_INLINE_FUNCTION U() {}; + } assume , oldval , newval ; +#else + union U { + int i ; + T t ; + } assume , oldval , newval ; +#endif + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = assume.t + val ; + oldval.i = atomicCAS( (int*)dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); // fix: variable is 'assume', not 'assumed' (matches the long-long overload below) + + return oldval.t ; +} + +template < typename T > +__inline__ __device__ +T atomic_fetch_add( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int) , const T >::type val ) +{ +#ifdef KOKKOS_HAVE_CXX11 + union U { + unsigned long long int i ; + T t ; + KOKKOS_INLINE_FUNCTION U() {}; + } assume , oldval , newval ; +#else + union U { + unsigned long long int i ; + T t ; + } assume , oldval , newval ; +#endif + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = assume.t + val ; + oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} + +//---------------------------------------------------------------------------- + +template < typename T > +__inline__ __device__ +T atomic_fetch_add( volatile T * const dest , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + , const T >::type& val ) +{ + T return_val; + // This is a way to (hopefully) avoid dead lock in a warp + bool done = false; + while (! 
done ) { + if( Impl::lock_address_cuda_space( (void*) dest ) ) { + return_val = *dest; + *dest = return_val + val; + Impl::unlock_address_cuda_space( (void*) dest ); done = true; // fix: terminate retry loop once the lock was held, otherwise this spins forever + } + } + return return_val; +} +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) + +KOKKOS_INLINE_FUNCTION +int atomic_fetch_add( volatile int * const dest , const int val ) +{ return __sync_fetch_and_add(dest,val); } + +KOKKOS_INLINE_FUNCTION +long int atomic_fetch_add( volatile long int * const dest , const long int val ) +{ return __sync_fetch_and_add(dest,val); } + +#if defined( KOKKOS_ATOMICS_USE_GCC ) + +KOKKOS_INLINE_FUNCTION +unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val ) +{ return __sync_fetch_and_add(dest,val); } + +KOKKOS_INLINE_FUNCTION +unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val ) +{ return __sync_fetch_and_add(dest,val); } + +#endif + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_add( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val ) +{ +#ifdef KOKKOS_HAVE_CXX11 + union U { + int i ; + T t ; + KOKKOS_INLINE_FUNCTION U() {}; + } assume , oldval , newval ; +#else + union U { + int i ; + T t ; + } assume , oldval , newval ; +#endif + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = assume.t + val ; + oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_add( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(long) , const T >::type val ) +{ +#ifdef KOKKOS_HAVE_CXX11 + union U { + long i ; + T t ; + KOKKOS_INLINE_FUNCTION U() {}; + } assume , oldval , newval ; +#else + union 
U { + long i ; + T t ; + } assume , oldval , newval ; +#endif + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = assume.t + val ; + oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} + +#ifdef KOKKOS_ENABLE_ASM +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_add( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) != sizeof(long) && + sizeof(T) == sizeof(Impl::cas128_t) , const T >::type val ) +{ + union U { + Impl::cas128_t i ; + T t ; + KOKKOS_INLINE_FUNCTION U() {}; + } assume , oldval , newval ; + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = assume.t + val ; + oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} +#endif + +//---------------------------------------------------------------------------- + +template < typename T > +inline +T atomic_fetch_add( volatile T * const dest , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + #if defined(KOKKOS_ENABLE_ASM) + && ( sizeof(T) != 16 ) + #endif + , const T >::type& val ) +{ + while( !Impl::lock_address_host_space( (void*) dest ) ); + T return_val = *dest; + const T tmp = *dest = return_val + val; + #ifndef KOKKOS_COMPILER_CLANG + (void) tmp; + #endif + Impl::unlock_address_host_space( (void*) dest ); + return return_val; +} +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template< typename T > +T atomic_fetch_add( volatile T * const dest , const T val ) +{ + T retval; +#pragma omp atomic capture + { + retval = dest[0]; + dest[0] += val; + } + return retval; +} + +#endif + +//---------------------------------------------------------------------------- + +// Simpler version of atomic_fetch_add without the fetch +template 
<typename T> +KOKKOS_INLINE_FUNCTION +void atomic_add(volatile T * const dest, const T src) { + atomic_fetch_add(dest,src); +} + +// Atomic increment +template<typename T> +KOKKOS_INLINE_FUNCTION +void atomic_increment(volatile T* a) { + Kokkos::atomic_fetch_add(a,1); +} + +template<typename T> +KOKKOS_INLINE_FUNCTION +void atomic_decrement(volatile T* a) { + Kokkos::atomic_fetch_add(a,-1); +} + +} +#endif + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp new file mode 100755 index 0000000000000000000000000000000000000000..9b7ebae4ac6df12bae659e50aa7da34429ac3187 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp @@ -0,0 +1,125 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP ) +#define KOKKOS_ATOMIC_FETCH_AND_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + +// Support for int, unsigned int, unsigned long long int, and float + +__inline__ __device__ +int atomic_fetch_and( volatile int * const dest , const int val ) +{ return atomicAnd((int*)dest,val); } + +__inline__ __device__ +unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val ) +{ return atomicAnd((unsigned int*)dest,val); } + +#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ ) +__inline__ __device__ +unsigned long long int atomic_fetch_and( volatile unsigned long long int * const dest , + const unsigned long long int val ) +{ return atomicAnd((unsigned long long int*)dest,val); } +#endif + +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) + +KOKKOS_INLINE_FUNCTION +int atomic_fetch_and( volatile int * const dest , const int val ) +{ return __sync_fetch_and_and(dest,val); } + +KOKKOS_INLINE_FUNCTION +long int atomic_fetch_and( volatile long int * const dest , 
const long int val ) +{ return __sync_fetch_and_and(dest,val); } + +#if defined( KOKKOS_ATOMICS_USE_GCC ) + +KOKKOS_INLINE_FUNCTION +unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val ) +{ return __sync_fetch_and_and(dest,val); } + +KOKKOS_INLINE_FUNCTION +unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val ) +{ return __sync_fetch_and_and(dest,val); } + +#endif + +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template< typename T > +T atomic_fetch_and( volatile T * const dest , const T val ) +{ + T retval; +#pragma omp atomic capture + { + retval = dest[0]; + dest[0] &= val; + } + return retval; +} + +#endif + +//---------------------------------------------------------------------------- + +// Simpler version of atomic_fetch_and without the fetch +template <typename T> +KOKKOS_INLINE_FUNCTION +void atomic_and(volatile T * const dest, const T src) { + (void)atomic_fetch_and(dest,src); +} + +} + +#endif + + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp new file mode 100755 index 0000000000000000000000000000000000000000..f15e61a3aea2ac2e7120d88a7151390cc2bf0b73 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp @@ -0,0 +1,125 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_OR_HPP ) +#define KOKKOS_ATOMIC_FETCH_OR_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + +// Support for int, unsigned int, unsigned long long int, and float + +__inline__ __device__ +int atomic_fetch_or( volatile int * const dest , const int val ) +{ return atomicOr((int*)dest,val); } + +__inline__ __device__ +unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val ) +{ return atomicOr((unsigned int*)dest,val); } + +#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ ) +__inline__ __device__ +unsigned long long int atomic_fetch_or( volatile unsigned long long int * const dest , + const unsigned long long int val ) +{ return atomicOr((unsigned long long int*)dest,val); } +#endif + +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) + +KOKKOS_INLINE_FUNCTION +int atomic_fetch_or( volatile int * const dest , const int val ) +{ return __sync_fetch_and_or(dest,val); } + +KOKKOS_INLINE_FUNCTION +long int atomic_fetch_or( volatile long int * const dest , const long int val ) +{ return __sync_fetch_and_or(dest,val); } + +#if defined( KOKKOS_ATOMICS_USE_GCC ) + +KOKKOS_INLINE_FUNCTION +unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val ) +{ return __sync_fetch_and_or(dest,val); } + +KOKKOS_INLINE_FUNCTION +unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val ) +{ return __sync_fetch_and_or(dest,val); } + +#endif + +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template< typename T > +T atomic_fetch_or( volatile T * const dest , const T val ) +{ + T retval; +#pragma omp atomic capture + { + retval = dest[0]; + dest[0] |= val; + } + 
return retval; +} + +#endif + +//---------------------------------------------------------------------------- + +// Simpler version of atomic_fetch_or without the fetch +template <typename T> +KOKKOS_INLINE_FUNCTION +void atomic_or(volatile T * const dest, const T src) { + (void)atomic_fetch_or(dest,src); +} + +} + +#endif + + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp new file mode 100755 index 0000000000000000000000000000000000000000..259cba794ac6776f562260a6c3bb69a6afc67308 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp @@ -0,0 +1,233 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP ) +#define KOKKOS_ATOMIC_FETCH_SUB_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + +// Support for int, unsigned int, unsigned long long int, and float + +__inline__ __device__ +int atomic_fetch_sub( volatile int * const dest , const int val ) +{ return atomicSub((int*)dest,val); } + +__inline__ __device__ +unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val ) +{ return atomicSub((unsigned int*)dest,val); } + +template < typename T > +__inline__ __device__ +T atomic_fetch_sub( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val ) +{ + union { int i ; T t ; } oldval , assume , newval ; + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = assume.t - val ; + oldval.i = atomicCAS( (int*)dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} + +template < typename T > +__inline__ __device__ +T atomic_fetch_sub( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int) , const
 T >::type val ) +{ + union { unsigned long long int i ; T t ; } oldval , assume , newval ; + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = assume.t - val ; + oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} + + +//---------------------------------------------------------------------------- + +template < typename T > +__inline__ __device__ +T atomic_fetch_sub( volatile T * const dest , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + , const T >::type& val ) +{ + T return_val; + // This is a way to (hopefully) avoid dead lock in a warp + bool done = false; + while (! done ) { + if( Impl::lock_address_cuda_space( (void*) dest ) ) { + return_val = *dest; + *dest = return_val - val; + Impl::unlock_address_cuda_space( (void*) dest ); done = true; + } + } + return return_val; +} + +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) + +KOKKOS_INLINE_FUNCTION +int atomic_fetch_sub( volatile int * const dest , const int val ) +{ return __sync_fetch_and_sub(dest,val); } + +KOKKOS_INLINE_FUNCTION +long int atomic_fetch_sub( volatile long int * const dest , const long int val ) +{ return __sync_fetch_and_sub(dest,val); } + +#if defined( KOKKOS_ATOMICS_USE_GCC ) + +KOKKOS_INLINE_FUNCTION +unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val ) +{ return __sync_fetch_and_sub(dest,val); } + +KOKKOS_INLINE_FUNCTION +unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val ) +{ return __sync_fetch_and_sub(dest,val); } + +#endif + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_sub( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val ) +{ + union { int i ; T t ; } assume , oldval , newval ;
+ + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = assume.t - val ; + oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_sub( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(long) , const T >::type val ) +{ + union { long i ; T t ; } assume , oldval , newval ; + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = assume.t - val ; + oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} + + +//---------------------------------------------------------------------------- + +template < typename T > +inline +T atomic_fetch_sub( volatile T * const dest , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + , const T >::type& val ) +{ + while( !Impl::lock_address_host_space( (void*) dest ) ); + T return_val = *dest; + *dest = return_val - val; + Impl::unlock_address_host_space( (void*) dest ); + return return_val; +} + +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template< typename T > +T atomic_fetch_sub( volatile T * const dest , const T val ) +{ + T retval; +#pragma omp atomic capture + { + retval = dest[0]; + dest[0] -= val; + } + return retval; +} + +#endif + +// Simpler version of atomic_fetch_sub without the fetch +template <typename T> +KOKKOS_INLINE_FUNCTION +void atomic_sub(volatile T * const dest, const T src) { + atomic_fetch_sub(dest,src); +} + +} + +#include<impl/Kokkos_Atomic_Assembly_X86.hpp> +#endif + + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp new file mode 100755 index 
0000000000000000000000000000000000000000..bd968633bb69a8aec9bf8650558c5b140b9c504f --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp @@ -0,0 +1,375 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP ) +#define KOKKOS_ATOMIC_GENERIC_HPP +#include <Kokkos_Macros.hpp> + +// Combination operands to be used in an Compare and Exchange based atomic operation +namespace Kokkos { +namespace Impl { + +template<class Scalar1, class Scalar2> +struct AddOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1+val2; + } +}; + +template<class Scalar1, class Scalar2> +struct SubOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1-val2; + } +}; + +template<class Scalar1, class Scalar2> +struct MulOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1*val2; + } +}; + +template<class Scalar1, class Scalar2> +struct DivOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1/val2; + } +}; + +template<class Scalar1, class Scalar2> +struct ModOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1%val2; + } +}; + +template<class Scalar1, class Scalar2> +struct AndOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1&val2; + } +}; + +template<class Scalar1, class Scalar2> +struct OrOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1|val2; + } +}; + +template<class Scalar1, class Scalar2> +struct XorOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1^val2; + } +}; + +template<class Scalar1, class Scalar2> +struct LShiftOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 
apply(const Scalar1& val1, const Scalar2& val2) { + return val1<<val2; + } +}; + +template<class Scalar1, class Scalar2> +struct RShiftOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1>>val2; + } +}; + +template < class Oper, typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_oper( const Oper& op, volatile T * const dest , + typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int) , const T >::type val ) +{ + union { unsigned long long int i ; T t ; } oldval , assume , newval ; + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = Oper::apply(assume.t, val) ; + oldval.i = ::Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} + +template < class Oper, typename T > +KOKKOS_INLINE_FUNCTION +T atomic_oper_fetch( const Oper& op, volatile T * const dest , + typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int) , const T >::type val ) +{ + union { unsigned long long int i ; T t ; } oldval , assume , newval ; + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = Oper::apply(assume.t, val) ; + oldval.i = ::Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return newval.t ; +} + +template < class Oper, typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_oper( const Oper& op, volatile T * const dest , + typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val ) +{ + union { int i ; T t ; } oldval , assume , newval ; + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = Oper::apply(assume.t, val) ; + oldval.i = ::Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} + +template < 
class Oper, typename T > +KOKKOS_INLINE_FUNCTION +T atomic_oper_fetch( const Oper& op, volatile T * const dest , + typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int), const T >::type val ) +{ + union { int i ; T t ; } oldval , assume , newval ; + + oldval.t = *dest ; + + do { + assume.i = oldval.i ; + newval.t = Oper::apply(assume.t, val) ; + oldval.i = ::Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return newval.t ; +} + +template < class Oper, typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_oper( const Oper& op, volatile T * const dest , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + && ( sizeof(T) != 16 ) + #endif + , const T >::type val ) +{ + +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + while( !Impl::lock_address_host_space( (void*) dest ) ); + T return_val = *dest; + *dest = Oper::apply(return_val, val); + Impl::unlock_address_host_space( (void*) dest ); + return return_val; +#else + // This is a way to (hopefully) avoid dead lock in a warp + T return_val; bool done = false; + while (!
done ) { + if( Impl::lock_address_cuda_space( (void*) dest ) ) { + return_val = *dest; + *dest = Oper::apply(return_val, val); + Impl::unlock_address_cuda_space( (void*) dest ); done = true; + } + } + return return_val; +#endif +} + +template < class Oper, typename T > +KOKKOS_INLINE_FUNCTION +T atomic_oper_fetch( const Oper& op, volatile T * const dest , + typename ::Kokkos::Impl::enable_if< + ( sizeof(T) != 4 ) + && ( sizeof(T) != 8 ) + #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + && ( sizeof(T) != 16 ) + #endif + , const T >::type& val ) +{ + +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + while( !Impl::lock_address_host_space( (void*) dest ) ); + T return_val = Oper::apply(*dest, val); + *dest = return_val; + Impl::unlock_address_host_space( (void*) dest ); + return return_val; +#else + // This is a way to (hopefully) avoid dead lock in a warp + T return_val; bool done = false; + while (! done ) { + if( Impl::lock_address_cuda_space( (void*) dest ) ) { + return_val = Oper::apply(*dest, val); + *dest = return_val; + Impl::unlock_address_cuda_space( (void*) dest ); done = true; + } + } + return return_val; +#endif +} + +} +} + +namespace Kokkos { + +// Fetch_Oper atomics: return value before operation +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_mul(volatile T * const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::MulOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_div(volatile T * const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::DivOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_mod(volatile T * const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::ModOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_and(volatile T * const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::AndOper<T,const T>(),dest,val); +} + +template < typename T > 
+KOKKOS_INLINE_FUNCTION +T atomic_fetch_or(volatile T * const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::OrOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_xor(volatile T * const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::XorOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_lshift(volatile T * const dest, const unsigned int val) { + return Impl::atomic_fetch_oper(Impl::LShiftOper<T,const unsigned int>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) { + return Impl::atomic_fetch_oper(Impl::RShiftOper<T,const unsigned int>(),dest,val); +} + + +// Oper Fetch atomics: return value after operation +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_mul_fetch(volatile T * const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::MulOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_div_fetch(volatile T * const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::DivOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_mod_fetch(volatile T * const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::ModOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_and_fetch(volatile T * const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::AndOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_or_fetch(volatile T * const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::OrOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_xor_fetch(volatile T * const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::XorOper<T,const T>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T 
atomic_lshift_fetch(volatile T * const dest, const unsigned int val) { + return Impl::atomic_oper_fetch(Impl::LShiftOper<T,const unsigned int>(),dest,val); +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) { + return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val); +} + + +} +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp new file mode 100755 index 0000000000000000000000000000000000000000..f95ed67da97e3ada83dac18f8f3fc2dab04c7afb --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp @@ -0,0 +1,462 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_ATOMIC_VIEW_HPP +#define KOKKOS_ATOMIC_VIEW_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Atomic.hpp> + +namespace Kokkos { namespace Impl { + +class AllocationTracker; + +//The following tag is used to prevent an implicit call of the constructor when trying +//to assign a literal 0 int ( = 0 ); +struct AtomicViewConstTag {}; + +template<class ViewTraits> +class AtomicDataElement { +public: + typedef typename ViewTraits::value_type value_type; + typedef typename ViewTraits::const_value_type const_value_type; + typedef typename ViewTraits::non_const_value_type non_const_value_type; + volatile value_type* const ptr; + + KOKKOS_INLINE_FUNCTION + AtomicDataElement(value_type* ptr_, AtomicViewConstTag ):ptr(ptr_){} + + KOKKOS_INLINE_FUNCTION + const_value_type operator = (const_value_type& val) const { + *ptr = val; + return val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator = (volatile const_value_type& val) const { + *ptr = val; + return val; + } + + KOKKOS_INLINE_FUNCTION + void inc() const { + Kokkos::atomic_increment(ptr); + } + + KOKKOS_INLINE_FUNCTION + void dec() const { + Kokkos::atomic_decrement(ptr); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator ++ () const { + const_value_type tmp = Kokkos::atomic_fetch_add(ptr,1); + 
return tmp+1; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator -- () const { + const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-1); + return tmp-1; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator ++ (int) const { + return Kokkos::atomic_fetch_add(ptr,1); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator -- (int) const { + return Kokkos::atomic_fetch_add(ptr,-1); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator += (const_value_type& val) const { + const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val); + return tmp+val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator += (volatile const_value_type& val) const { + const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val); + return tmp+val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator -= (const_value_type& val) const { + const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val); + return tmp-val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator -= (volatile const_value_type& val) const { + const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val); + return tmp-val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator *= (const_value_type& val) const { + return Kokkos::atomic_mul_fetch(ptr,val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator *= (volatile const_value_type& val) const { + return Kokkos::atomic_mul_fetch(ptr,val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator /= (const_value_type& val) const { + return Kokkos::atomic_div_fetch(ptr,val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator /= (volatile const_value_type& val) const { + return Kokkos::atomic_div_fetch(ptr,val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator %= (const_value_type& val) const { + return Kokkos::atomic_mod_fetch(ptr,val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator %= (volatile const_value_type& val) const { + return Kokkos::atomic_mod_fetch(ptr,val); + } + + KOKKOS_INLINE_FUNCTION + 
const_value_type operator &= (const_value_type& val) const { + return Kokkos::atomic_and_fetch(ptr,val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator &= (volatile const_value_type& val) const { + return Kokkos::atomic_and_fetch(ptr,val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator ^= (const_value_type& val) const { + return Kokkos::atomic_xor_fetch(ptr,val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator ^= (volatile const_value_type& val) const { + return Kokkos::atomic_xor_fetch(ptr,val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator |= (const_value_type& val) const { + return Kokkos::atomic_or_fetch(ptr,val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator |= (volatile const_value_type& val) const { + return Kokkos::atomic_or_fetch(ptr,val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator <<= (const_value_type& val) const { + return Kokkos::atomic_lshift_fetch(ptr,val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator <<= (volatile const_value_type& val) const { + return Kokkos::atomic_lshift_fetch(ptr,val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator >>= (const_value_type& val) const { + return Kokkos::atomic_rshift_fetch(ptr,val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator >>= (volatile const_value_type& val) const { + return Kokkos::atomic_rshift_fetch(ptr,val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator + (const_value_type& val) const { + return *ptr+val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator + (volatile const_value_type& val) const { + return *ptr+val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator - (const_value_type& val) const { + return *ptr-val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator - (volatile const_value_type& val) const { + return *ptr-val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator * (const_value_type& val) const { + return *ptr*val; + } + KOKKOS_INLINE_FUNCTION 
+ const_value_type operator * (volatile const_value_type& val) const { + return *ptr*val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator / (const_value_type& val) const { + return *ptr/val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator / (volatile const_value_type& val) const { + return *ptr/val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator % (const_value_type& val) const { + return *ptr%val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator % (volatile const_value_type& val) const { + return *ptr%val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator ! () const { + return !*ptr; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator && (const_value_type& val) const { + return *ptr&&val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator && (volatile const_value_type& val) const { + return *ptr&&val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator || (const_value_type& val) const { + return *ptr||val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator || (volatile const_value_type& val) const { + return *ptr||val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator & (const_value_type& val) const { + return *ptr&val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator & (volatile const_value_type& val) const { + return *ptr&val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator | (const_value_type& val) const { + return *ptr|val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator | (volatile const_value_type& val) const { + return *ptr|val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator ^ (const_value_type& val) const { + return *ptr^val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator ^ (volatile const_value_type& val) const { + return *ptr^val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator ~ () const { + return ~*ptr; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator << (const unsigned int& val) const { + return
*ptr<<val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator << (volatile const unsigned int& val) const { + return *ptr<<val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator >> (const unsigned int& val) const { + return *ptr>>val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator >> (volatile const unsigned int& val) const { + return *ptr>>val; + } + + KOKKOS_INLINE_FUNCTION + bool operator == (const_value_type& val) const { + return *ptr == val; + } + KOKKOS_INLINE_FUNCTION + bool operator == (volatile const_value_type& val) const { + return *ptr == val; + } + + KOKKOS_INLINE_FUNCTION + bool operator != (const_value_type& val) const { + return *ptr != val; + } + KOKKOS_INLINE_FUNCTION + bool operator != (volatile const_value_type& val) const { + return *ptr != val; + } + + KOKKOS_INLINE_FUNCTION + bool operator >= (const_value_type& val) const { + return *ptr >= val; + } + KOKKOS_INLINE_FUNCTION + bool operator >= (volatile const_value_type& val) const { + return *ptr >= val; + } + + KOKKOS_INLINE_FUNCTION + bool operator <= (const_value_type& val) const { + return *ptr <= val; + } + KOKKOS_INLINE_FUNCTION + bool operator <= (volatile const_value_type& val) const { + return *ptr <= val; + } + + KOKKOS_INLINE_FUNCTION + bool operator < (const_value_type& val) const { + return *ptr < val; + } + KOKKOS_INLINE_FUNCTION + bool operator < (volatile const_value_type& val) const { + return *ptr < val; + } + + KOKKOS_INLINE_FUNCTION + bool operator > (const_value_type& val) const { + return *ptr > val; + } + KOKKOS_INLINE_FUNCTION + bool operator > (volatile const_value_type& val) const { + return *ptr > val; + } + + KOKKOS_INLINE_FUNCTION + operator const_value_type () const { + //return Kokkos::atomic_load(ptr); + return *ptr; + } + + KOKKOS_INLINE_FUNCTION + operator volatile non_const_value_type () volatile const { + //return Kokkos::atomic_load(ptr); + return *ptr; + } +}; + +template<class ViewTraits> +class AtomicViewDataHandle { +public: + 
typename ViewTraits::value_type* ptr; + + KOKKOS_INLINE_FUNCTION + AtomicViewDataHandle() + : ptr(NULL) + {} + + KOKKOS_INLINE_FUNCTION + AtomicViewDataHandle(typename ViewTraits::value_type* ptr_) + :ptr(ptr_) + {} + + template<class iType> + KOKKOS_INLINE_FUNCTION + AtomicDataElement<ViewTraits> operator[] (const iType& i) const { + return AtomicDataElement<ViewTraits>(ptr+i,AtomicViewConstTag()); + } + + + KOKKOS_INLINE_FUNCTION + operator typename ViewTraits::value_type * () const { return ptr ; } + +}; + +template<unsigned Size> +struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars; + +template<> +struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<4> { + typedef int type; +}; + +template<> +struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> { + typedef int64_t type; +}; + +// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics. +template<class ViewTraits> +class ViewDataHandle< + ViewTraits , + typename enable_if< + ( ! 
is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) && + ( ViewTraits::memory_traits::Atomic ) + >::type > +{ +private: +// typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) || +// (sizeof(typename ViewTraits::const_value_type)==8), +// int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type +// atomic_view_possible; + typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type; + typedef ViewDataHandle self_type; + +public: + enum { ReturnTypeIsReference = false }; + + typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type; + typedef Impl::AtomicDataElement<ViewTraits> return_type; + + KOKKOS_INLINE_FUNCTION + static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ ) + { + return handle_type(arg_data_ptr); + } +}; + +}} // namespace Kokkos::Impl + +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp new file mode 100755 index 0000000000000000000000000000000000000000..62581569fbfebedbcc577c29837233123a8ec8a3 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp @@ -0,0 +1,211 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_ATOMIC_WINDOWS_HPP +#define KOKKOS_ATOMIC_WINDOWS_HPP +#ifdef _WIN32 + +#define NOMINMAX +#include <Windows.h> + +namespace Kokkos { + namespace Impl { + _declspec(align(16)) + struct cas128_t + { + LONGLONG lower; + LONGLONG upper; + KOKKOS_INLINE_FUNCTION + bool operator != (const cas128_t& a) const { + return (lower != a.lower) || upper != a.upper; + } + }; + } + +#ifdef KOKKOS_HAVE_CXX11 + template < typename T > + KOKKOS_INLINE_FUNCTION + T atomic_compare_exchange(volatile T * const dest, const T & compare, + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONG), const T & >::type val) + { + union U { + LONG i; + T t; + KOKKOS_INLINE_FUNCTION U() {}; + } tmp; + + tmp.i = _InterlockedCompareExchange((LONG*)dest, *((LONG*)&val), *((LONG*)&compare)); + return tmp.t; + } + + template < typename T > + KOKKOS_INLINE_FUNCTION + T atomic_compare_exchange(volatile T * const dest, const T & compare, + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONGLONG), const T & >::type val) + { + union U { + LONGLONG i; + T t; + KOKKOS_INLINE_FUNCTION U() {}; + } tmp; + + tmp.i = _InterlockedCompareExchange64((LONGLONG*)dest, *((LONGLONG*)&val), *((LONGLONG*)&compare)); + return tmp.t; + } + + template < typename T > + KOKKOS_INLINE_FUNCTION + T atomic_compare_exchange(volatile T * const dest, const T & compare, + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val) + { + union U { + Impl::cas128_t i; + T t; + KOKKOS_INLINE_FUNCTION U() {}; + } tmp, newval; + newval.t = val; + tmp.i = _InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, newval.i.lower, *((LONGLONG*)&compare)); + return tmp.t; + } + + template< typename T > + T atomic_fetch_or(volatile T * const dest, const T val) { + T oldval = *dest; + T assume; + do { + assume = oldval; + T newval = val | oldval; + 
oldval = atomic_compare_exchange(dest, assume, newval); + } while (assume != oldval); + + return oldval; + } + + template< typename T > + T atomic_fetch_and(volatile T * const dest, const T val) { + T oldval = *dest; + T assume; + do { + assume = oldval; + T newval = val & oldval; + oldval = atomic_compare_exchange(dest, assume, newval); + } while (assume != oldval); + + return oldval; + } + + template< typename T > + T atomic_fetch_add(volatile T * const dest, const T val) { + T oldval = *dest; + T assume; + do { + assume = oldval; + T newval = val + oldval; + oldval = atomic_compare_exchange(dest, assume, newval); + } while (assume != oldval); + + return oldval; + } + + template< typename T > + T atomic_fetch_exchange(volatile T * const dest, const T val) { + T oldval = *dest; + T assume; + do { + assume = oldval; + oldval = atomic_compare_exchange(dest, assume, val); + } while (assume != oldval); + + return oldval; + } + + template< typename T > + void atomic_or(volatile T * const dest, const T val) { + atomic_fetch_or(dest, val); + } + + template< typename T > + void atomic_and(volatile T * const dest, const T val) { + atomic_fetch_and(dest, val); + } + + template< typename T > + void atomic_add(volatile T * const dest, const T val) { + atomic_fetch_add(dest, val); + } + + template< typename T > + void atomic_exchange(volatile T * const dest, const T val) { + atomic_fetch_exchange(dest, val); + } + + template< typename T > + void atomic_assign(volatile T * const dest, const T val) { + atomic_fetch_exchange(dest, val); + } + + template< typename T > + void atomic_increment(volatile T * const dest) { + T oldval = *dest; + T assume; + do { + assume = oldval; + T newval = assume + 1; + oldval = atomic_compare_exchange(dest, assume, newval); + } while (assume != oldval); + } + + template< typename T > + void atomic_decrement(volatile T * const dest) { + T oldval = *dest; + T assume; + do { + assume = oldval; + T newval = assume - 1; + oldval = atomic_compare_exchange(dest,
assume, newval); + } while (assume != oldval); + } + +} +#endif +#endif +#endif \ No newline at end of file diff --git a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp b/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp new file mode 100755 index 0000000000000000000000000000000000000000..8da619fdba9b58bf16f6f23bd1a148bdd224a28d --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp @@ -0,0 +1,281 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_HostSpace.hpp> + +#include <impl/Kokkos_BasicAllocators.hpp> +#include <impl/Kokkos_Error.hpp> + + +#include <stdint.h> // uintptr_t +#include <cstdlib> // for malloc, realloc, and free +#include <cstring> // for memcpy +#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc +#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES + +#include <sstream> + +namespace Kokkos { namespace Impl { + +/*--------------------------------------------------------------------------*/ + +void* MallocAllocator::allocate( size_t size ) +{ + void * ptr = NULL; + if (size) { + ptr = malloc(size); + + if (!ptr) + { + std::ostringstream msg ; + msg << name() << ": allocate(" << size << ") FAILED"; + throw_runtime_exception( msg.str() ); + } + } + return ptr; +} + +void MallocAllocator::deallocate( void * ptr, size_t /*size*/ ) +{ + if (ptr) { + free(ptr); + } +} + +void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size) +{ + void * ptr = realloc(old_ptr, new_size); + + if (new_size > 0u && ptr == NULL) { + throw_runtime_exception("Error: Malloc Allocator could not reallocate memory"); + } + return ptr; +} + +/*--------------------------------------------------------------------------*/ + +namespace { + +void * 
raw_aligned_allocate( size_t size, size_t alignment ) +{ + void * ptr = NULL; + if ( size ) { +#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA ) + ptr = _mm_malloc( size , alignment ); + +#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \ + ( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 ) + + posix_memalign( & ptr, alignment , size ); + +#else + // Over-allocate to and round up to guarantee proper alignment. + size_t size_padded = size + alignment + sizeof(void *); + void * alloc_ptr = malloc( size_padded ); + + if (alloc_ptr) { + uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr); + // offset enough to record the alloc_ptr + address += sizeof(void *); + uintptr_t rem = address % alignment; + uintptr_t offset = rem ? (alignment - rem) : 0u; + address += offset; + ptr = reinterpret_cast<void *>(address); + // record the alloc'd pointer + address -= sizeof(void *); + *reinterpret_cast<void **>(address) = alloc_ptr; + } +#endif + } + return ptr; +} + +void raw_aligned_deallocate( void * ptr, size_t /*size*/ ) +{ + if ( ptr ) { +#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA ) + _mm_free( ptr ); + +#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \ + ( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 ) + free( ptr ); +#else + // get the alloc'd pointer + void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1); + free( alloc_ptr ); +#endif + } + +} + +} + +void* AlignedAllocator::allocate( size_t size ) +{ + void * ptr = 0 ; + + if ( size ) { + ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT); + + if (!ptr) + { + std::ostringstream msg ; + msg << name() << ": allocate(" << size << ") FAILED"; + throw_runtime_exception( msg.str() ); + } + } + return ptr; +} + +void AlignedAllocator::deallocate( void * ptr, size_t size ) +{ + raw_aligned_deallocate( ptr, size); +} + +void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size) +{ + void * ptr = old_ptr;; + + if 
(old_size < new_size) { + ptr = allocate( new_size ); + + memcpy(ptr, old_ptr, old_size ); + + deallocate( old_ptr, old_size ); + } + + return ptr; +} + +/*--------------------------------------------------------------------------*/ + +// mmap flags for private anonymous memory allocation +#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE ) + #define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS) +#elif defined( MAP_ANON) && defined( MAP_PRIVATE ) + #define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON) +#else + #define NO_MMAP +#endif + +// huge page tables +#if !defined( NO_MMAP ) + #if defined( MAP_HUGETLB ) + #define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB ) + #elif defined( MMAP_FLAGS ) + #define MMAP_FLAGS_HUGE MMAP_FLAGS + #endif + // threshold to use huge pages + #define MMAP_USE_HUGE_PAGES (1u << 27) +#endif + +// read write access to private memory +#if !defined( NO_MMAP ) + #define MMAP_PROTECTION (PROT_READ | PROT_WRITE) +#endif + + +void* PageAlignedAllocator::allocate( size_t size ) +{ + void *ptr = NULL; + if (size) { +#if !defined NO_MMAP + if ( size < MMAP_USE_HUGE_PAGES ) { + ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/); + } else { + ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/); + } + if (ptr == MAP_FAILED) { + ptr = NULL; + } +#else + static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE ) + + ptr = raw_aligned_allocate( size, page_size); +#endif + if (!ptr) + { + std::ostringstream msg ; + msg << name() << ": allocate(" << size << ") FAILED"; + throw_runtime_exception( msg.str() ); + } + } + return ptr; +} + +void PageAlignedAllocator::deallocate( void * ptr, size_t size ) +{ +#if !defined( NO_MMAP ) + munmap(ptr, size); +#else + raw_aligned_deallocate(ptr, size); +#endif +} + +void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size) +{ + void * ptr = NULL; +#if defined( NO_MMAP ) || defined( 
__APPLE__ ) + + if (old_size != new_size) { + ptr = allocate( new_size ); + + memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) ); + + deallocate( old_ptr, old_size ); + } + else { + ptr = old_ptr; + } +#else + ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE ); + + if (ptr == MAP_FAILED) { + throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory"); + } +#endif + + return ptr; +} + +}} // namespace Kokkos::Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp b/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp new file mode 100755 index 0000000000000000000000000000000000000000..76377c5f159abe88272a2a73794bf899a4427aee --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp @@ -0,0 +1,118 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_BASIC_ALLOCATORS_HPP +#define KOKKOS_BASIC_ALLOCATORS_HPP + + +namespace Kokkos { namespace Impl { + +/// class UnmanagedAllocator +/// does nothing when deallocate(ptr,size) is called +class UnmanagedAllocator +{ +public: + static const char * name() { return "Unmanaged Allocator"; } + + static void deallocate(void * /*ptr*/, size_t /*size*/) {} +}; + + +/// class MallocAllocator +class MallocAllocator +{ +public: + static const char * name() + { + return "Malloc Allocator"; + } + + static void* allocate(size_t size); + + static void deallocate(void * ptr, size_t size); + + static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); +}; + + +/// class AlignedAllocator +/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT +class AlignedAllocator +{ +public: + static const char * name() + { + return "Aligned Allocator"; + } + + static void* allocate(size_t size); + + static void deallocate(void * ptr, size_t size); + + static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); +}; + + 
+/// class PageAlignedAllocator +/// memory aligned to PAGE_SIZE +class PageAlignedAllocator +{ +public: + static const char * name() + { + return "Page Aligned Allocator"; + } + + static void* allocate(size_t size); + + static void deallocate(void * ptr, size_t size); + + static void * reallocate(void * old_ptr, size_t old_size, size_t new_size); +}; + + +}} // namespace Kokkos::Impl + +#endif //KOKKOS_BASIC_ALLOCATORS_HPP + + diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp new file mode 100755 index 0000000000000000000000000000000000000000..1c3c83cfe7c12c95889cee98c9be2c2bbc896f38 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -0,0 +1,447 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Error.hpp> +#include <cctype> +#include <cstring> +#include <iostream> +#include <cstdlib> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +namespace { + +bool is_unsigned_int(const char* str) +{ + const size_t len = strlen (str); + for (size_t i = 0; i < len; ++i) { + if (! isdigit (str[i])) { + return false; + } + } + return true; +} + +void initialize_internal(const InitArguments& args) +{ + // Protect declarations, to prevent "unused variable" warnings. 
+#if defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD ) + const int num_threads = args.num_threads; + const int use_numa = args.num_numa; +#endif // defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD ) +#if defined( KOKKOS_HAVE_CUDA ) + const int use_gpu = args.device_id; +#endif // defined( KOKKOS_HAVE_CUDA ) + +#if defined( KOKKOS_HAVE_OPENMP ) + if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value || + Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) { + if(num_threads>0) { + if(use_numa>0) { + Kokkos::OpenMP::initialize(num_threads,use_numa); + } + else { + Kokkos::OpenMP::initialize(num_threads); + } + } else { + Kokkos::OpenMP::initialize(); + } + //std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ; + } + else { + //std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ; + } +#endif + +#if defined( KOKKOS_HAVE_PTHREAD ) + if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value || + Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) { + if(num_threads>0) { + if(use_numa>0) { + Kokkos::Threads::initialize(num_threads,use_numa); + } + else { + Kokkos::Threads::initialize(num_threads); + } + } else { + Kokkos::Threads::initialize(); + } + //std::cout << "Kokkos::initialize() fyi: Pthread enabled and initialized" << std::endl ; + } + else { + //std::cout << "Kokkos::initialize() fyi: Pthread enabled but not initialized" << std::endl ; + } +#endif + +#if defined( KOKKOS_HAVE_SERIAL ) + // Prevent "unused variable" warning for 'args' input struct. If + // Serial::initialize() ever needs to take arguments from the input + // struct, you may remove this line of code. 
+ (void) args; + + if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value || + Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) { + Kokkos::Serial::initialize(); + } +#endif + +#if defined( KOKKOS_HAVE_CUDA ) + if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) { + if (use_gpu > -1) { + Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( use_gpu ) ); + } + else { + Kokkos::Cuda::initialize(); + } + //std::cout << "Kokkos::initialize() fyi: Cuda enabled and initialized" << std::endl ; + } +#endif + +#ifdef KOKKOSP_ENABLE_PROFILING + Kokkos::Experimental::initialize(); +#endif +} + +void finalize_internal( const bool all_spaces = false ) +{ + +#if defined( KOKKOS_HAVE_CUDA ) + if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || all_spaces ) { + if(Kokkos::Cuda::is_initialized()) + Kokkos::Cuda::finalize(); + } +#endif + +#if defined( KOKKOS_HAVE_OPENMP ) + if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value || + Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value || + all_spaces ) { + if(Kokkos::OpenMP::is_initialized()) + Kokkos::OpenMP::finalize(); + } +#endif + +#if defined( KOKKOS_HAVE_PTHREAD ) + if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value || + Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value || + all_spaces ) { + if(Kokkos::Threads::is_initialized()) + Kokkos::Threads::finalize(); + } +#endif + +#if defined( KOKKOS_HAVE_SERIAL ) + if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value || + Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value || + all_spaces ) { + if(Kokkos::Serial::is_initialized()) + Kokkos::Serial::finalize(); + } +#endif + +#ifdef KOKKOSP_ENABLE_PROFILING + Kokkos::Experimental::finalize(); +#endif + +} + +void fence_internal() +{ + +#if defined( KOKKOS_HAVE_CUDA ) + if( 
Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value ) { + Kokkos::Cuda::fence(); + } +#endif + +#if defined( KOKKOS_HAVE_OPENMP ) + if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value || + Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) { + Kokkos::OpenMP::fence(); + } +#endif + +#if defined( KOKKOS_HAVE_PTHREAD ) + if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value || + Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) { + Kokkos::Threads::fence(); + } +#endif + +#if defined( KOKKOS_HAVE_SERIAL ) + if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value || + Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) { + Kokkos::Serial::fence(); + } +#endif + +} + +} // namespace +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +void initialize(int& narg, char* arg[]) +{ + int num_threads = -1; + int numa = -1; + int device = -1; + + int kokkos_threads_found = 0; + int kokkos_numa_found = 0; + int kokkos_device_found = 0; + int kokkos_ndevices_found = 0; + + int iarg = 0; + + while (iarg < narg) { + if ((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || (strncmp(arg[iarg],"--threads",9) == 0)) { + //Find the number of threads (expecting --threads=XX) + if (!((strncmp(arg[iarg],"--kokkos-threads=",17) == 0) || (strncmp(arg[iarg],"--threads=",10) == 0))) + Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[])."); + + char* number = strchr(arg[iarg],'=')+1; + + if(!Impl::is_unsigned_int(number) || (strlen(number)==0)) + Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. 
Raised by Kokkos::initialize(int narg, char* argc[])."); + + if((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || !kokkos_threads_found) + num_threads = atoi(number); + + //Remove the --kokkos-threads argument from the list but leave --threads + if(strncmp(arg[iarg],"--kokkos-threads",16) == 0) { + for(int k=iarg;k<narg-1;k++) { + arg[k] = arg[k+1]; + } + kokkos_threads_found=1; + narg--; + } else { + iarg++; + } + } else if ((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || (strncmp(arg[iarg],"--numa",6) == 0)) { + //Find the number of numa (expecting --numa=XX) + if (!((strncmp(arg[iarg],"--kokkos-numa=",14) == 0) || (strncmp(arg[iarg],"--numa=",7) == 0))) + Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[])."); + + char* number = strchr(arg[iarg],'=')+1; + + if(!Impl::is_unsigned_int(number) || (strlen(number)==0)) + Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[])."); + + if((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || !kokkos_numa_found) + numa = atoi(number); + + //Remove the --kokkos-numa argument from the list but leave --numa + if(strncmp(arg[iarg],"--kokkos-numa",13) == 0) { + for(int k=iarg;k<narg-1;k++) { + arg[k] = arg[k+1]; + } + kokkos_numa_found=1; + narg--; + } else { + iarg++; + } + } else if ((strncmp(arg[iarg],"--kokkos-device",15) == 0) || (strncmp(arg[iarg],"--device",8) == 0)) { + //Find the number of device (expecting --device=XX) + if (!((strncmp(arg[iarg],"--kokkos-device=",16) == 0) || (strncmp(arg[iarg],"--device=",9) == 0))) + Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. 
Raised by Kokkos::initialize(int narg, char* argc[])."); + + char* number = strchr(arg[iarg],'=')+1; + + if(!Impl::is_unsigned_int(number) || (strlen(number)==0)) + Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[])."); + + if((strncmp(arg[iarg],"--kokkos-device",15) == 0) || !kokkos_device_found) + device = atoi(number); + + //Remove the --kokkos-device argument from the list but leave --device + if(strncmp(arg[iarg],"--kokkos-device",15) == 0) { + for(int k=iarg;k<narg-1;k++) { + arg[k] = arg[k+1]; + } + kokkos_device_found=1; + narg--; + } else { + iarg++; + } + } else if ((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || (strncmp(arg[iarg],"--ndevices",10) == 0)) { + + //Find the number of device (expecting --device=XX) + if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0))) + Impl::throw_runtime_exception("Error: expecting an '=INT[,INT]' after command line argument '--ndevices/--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[])."); + + int ndevices=-1; + int skip_device = 9999; + + char* num1 = strchr(arg[iarg],'=')+1; + char* num2 = strpbrk(num1,","); + int num1_len = num2==NULL?strlen(num1):num2-num1; + char* num1_only = new char[num1_len+1]; + strncpy(num1_only,num1,num1_len); + num1_only[num1_len]=0; + + if(!Impl::is_unsigned_int(num1_only) || (strlen(num1_only)==0)) { + Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[])."); + } + if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found) + ndevices = atoi(num1_only); + + if( num2 != NULL ) { + if(( !Impl::is_unsigned_int(num2+1) ) || (strlen(num2)==1) ) + Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices=XX,'. 
Raised by Kokkos::initialize(int narg, char* argc[])."); + + if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found) + skip_device = atoi(num2+1); + } + + if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found) { + char *str; + if ((str = getenv("SLURM_LOCALID"))) { + int local_rank = atoi(str); + device = local_rank % ndevices; + if (device >= skip_device) device++; + } + if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) { + int local_rank = atoi(str); + device = local_rank % ndevices; + if (device >= skip_device) device++; + } + if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) { + int local_rank = atoi(str); + device = local_rank % ndevices; + if (device >= skip_device) device++; + } + if(device==-1) { + device = 0; + if (device >= skip_device) device++; + } + } + + //Remove the --kokkos-ndevices argument from the list but leave --ndevices + if(strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) { + for(int k=iarg;k<narg-1;k++) { + arg[k] = arg[k+1]; + } + kokkos_ndevices_found=1; + narg--; + } else { + iarg++; + } + } else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) { + std::cout << std::endl; + std::cout << "--------------------------------------------------------------------------------" << std::endl; + std::cout << "-------------Kokkos command line arguments--------------------------------------" << std::endl; + std::cout << "--------------------------------------------------------------------------------" << std::endl; + std::cout << "The following arguments exist also without prefix 'kokkos' (e.g. --help)." << std::endl; + std::cout << "The prefixed arguments will be removed from the list by Kokkos::initialize()," << std::endl; + std::cout << "the non-prefixed ones are not removed. Prefixed versions take precedence over " << std::endl; + std::cout << "non prefixed ones, and the last occurence of an argument overwrites prior" << std::endl; + std::cout << "settings." 
<< std::endl; + std::cout << std::endl; + std::cout << "--kokkos-help : print this message" << std::endl; + std::cout << "--kokkos-threads=INT : specify total number of threads or" << std::endl; + std::cout << " number of threads per NUMA region if " << std::endl; + std::cout << " used in conjunction with '--numa' option. " << std::endl; + std::cout << "--kokkos-numa=INT : specify number of NUMA regions used by process." << std::endl; + std::cout << "--kokkos-device=INT : specify device id to be used by Kokkos. " << std::endl; + std::cout << "--kokkos-ndevices=INT[,INT] : used when running MPI jobs. Specify number of" << std::endl; + std::cout << " devices per node to be used. Process to device" << std::endl; + std::cout << " mapping happens by obtaining the local MPI rank" << std::endl; + std::cout << " and assigning devices round-robin. The optional" << std::endl; + std::cout << " second argument allows for an existing device" << std::endl; + std::cout << " to be ignored. This is most useful on workstations" << std::endl; + std::cout << " with multiple GPUs of which one is used to drive" << std::endl; + std::cout << " screen output." 
<< std::endl; + std::cout << std::endl; + std::cout << "--------------------------------------------------------------------------------" << std::endl; + std::cout << std::endl; + + //Remove the --kokkos-help argument from the list but leave --ndevices + if(strcmp(arg[iarg],"--kokkos-help") == 0) { + for(int k=iarg;k<narg-1;k++) { + arg[k] = arg[k+1]; + } + narg--; + } else { + iarg++; + } + } else + iarg++; + } + + InitArguments arguments; + arguments.num_threads = num_threads; + arguments.num_numa = numa; + arguments.device_id = device; + Impl::initialize_internal(arguments); +} + +void initialize(const InitArguments& arguments) { + Impl::initialize_internal(arguments); +} + +void finalize() +{ + Impl::finalize_internal(); +} + +void finalize_all() +{ + enum { all_spaces = true }; + Impl::finalize_internal( all_spaces ); +} + +void fence() +{ + Impl::fence_internal(); +} + +} // namespace Kokkos + diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.cpp b/lib/kokkos/core/src/impl/Kokkos_Error.cpp new file mode 100755 index 0000000000000000000000000000000000000000..97cfbfae7e82422f6795fd0228ccb993580afb89 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Error.cpp @@ -0,0 +1,193 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#include <ostream> +#include <sstream> +#include <iomanip> +#include <stdexcept> +#include <impl/Kokkos_Error.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void host_abort( const char * const message ) +{ + fwrite(message,1,strlen(message),stderr); + fflush(stderr); + abort(); +} + +void throw_runtime_exception( const std::string & msg ) +{ + std::ostringstream o ; + o << msg ; + traceback_callstack( o ); + throw std::runtime_error( o.str() ); +} + + +std::string human_memory_size(size_t arg_bytes) +{ + double bytes = arg_bytes; + const double K = 1024; + const double M = K*1024; + const double G = M*1024; + + std::ostringstream out; + if (bytes < K) { + out << std::setprecision(4) << bytes << " B"; + } else if (bytes < M) { + bytes /= K; + out << std::setprecision(4) << bytes << " K"; + } else if (bytes < G) { + bytes /= M; + out << std::setprecision(4) << bytes << " M"; + } else { + bytes /= G; + out << std::setprecision(4) << bytes << " G"; + } + return out.str(); +} + +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( __GNUC__ ) && defined( ENABLE_TRACEBACK ) + +/* This is only known to work with GNU C++ + * Must be compiled with '-rdynamic' + * Must be linked with '-ldl' + */ + +/* Print call stack into an error stream, + * so one knows in which function the error occured. 
+ * + * Code copied from: + * http://stupefydeveloper.blogspot.com/2008/10/cc-call-stack.html + * + * License on this site: + * This blog is licensed under a + * Creative Commons Attribution-Share Alike 3.0 Unported License. + * + * http://creativecommons.org/licenses/by-sa/3.0/ + * + * Modified to output to std::ostream. + */ +#include <signal.h> +#include <execinfo.h> +#include <cxxabi.h> +#include <dlfcn.h> +#include <stdlib.h> + +namespace Kokkos { +namespace Impl { + +void traceback_callstack( std::ostream & msg ) +{ + using namespace abi; + + enum { MAX_DEPTH = 32 }; + + void *trace[MAX_DEPTH]; + Dl_info dlinfo; + + int status; + + int trace_size = backtrace(trace, MAX_DEPTH); + + msg << std::endl << "Call stack {" << std::endl ; + + for (int i=1; i<trace_size; ++i) + { + if(!dladdr(trace[i], &dlinfo)) + continue; + + const char * symname = dlinfo.dli_sname; + + char * demangled = __cxa_demangle(symname, NULL, 0, &status); + + if ( status == 0 && demangled ) { + symname = demangled; + } + + if ( symname && *symname != 0 ) { + msg << " object: " << dlinfo.dli_fname + << " function: " << symname + << std::endl ; + } + + if ( demangled ) { + free(demangled); + } + } + msg << "}" ; +} + +} +} + +#else + +namespace Kokkos { +namespace Impl { + +void traceback_callstack( std::ostream & msg ) +{ + msg << std::endl << "Traceback functionality not available" << std::endl ; +} + +} +} + +#endif + diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.hpp b/lib/kokkos/core/src/impl/Kokkos_Error.hpp new file mode 100755 index 0000000000000000000000000000000000000000..33e203c948b23cc511205f529d6114d88f31307e --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Error.hpp @@ -0,0 +1,78 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_ERROR_HPP +#define KOKKOS_IMPL_ERROR_HPP + +#include <string> +#include <iosfwd> + +namespace Kokkos { +namespace Impl { + +void host_abort( const char * const ); + +void throw_runtime_exception( const std::string & ); + +void traceback_callstack( std::ostream & ); + +std::string human_memory_size(size_t arg_bytes); + +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) +namespace Kokkos { +inline +void abort( const char * const message ) { Kokkos::Impl::host_abort(message); } +} +#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp new file mode 100755 index 0000000000000000000000000000000000000000..ff6230b57c8abbd778059e55aa2a019d6bee70e2 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp @@ -0,0 +1,1070 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_FUNCTORADAPTER_HPP +#define KOKKOS_FUNCTORADAPTER_HPP + +#include <cstddef> +#include <Kokkos_Core_fwd.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Tags.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ArgTag , class Enable = void > +struct FunctorDeclaresValueType : public Impl::false_type {}; + +template< class FunctorType , class ArgTag > +struct FunctorDeclaresValueType< FunctorType , ArgTag + , typename Impl::enable_if_type< typename FunctorType::value_type >::type > + : public Impl::true_type {}; + + +/** \brief Query Functor and execution policy argument tag for value type. + * + * If C++11 enabled and 'value_type' is not explicitly declared then attempt + * to deduce the type from FunctorType::operator(). + */ +template< class FunctorType , class ArgTag , bool Dec = FunctorDeclaresValueType<FunctorType,ArgTag>::value > +struct FunctorValueTraits +{ + typedef void value_type ; + typedef void pointer_type ; + typedef void reference_type ; + + enum { StaticValueSize = 0 }; + + KOKKOS_FORCEINLINE_FUNCTION static + unsigned value_count( const FunctorType & ) { return 0 ; } + + KOKKOS_FORCEINLINE_FUNCTION static + unsigned value_size( const FunctorType & ) { return 0 ; } +}; + +template<class ArgTag> +struct FunctorValueTraits<void, ArgTag,false> +{ + typedef void reference_type; +}; + +/** \brief FunctorType::value_type is explicitly declared so use it. + * + * Two options for declaration + * + * 1) A plain-old-data (POD) type + * typedef {pod_type} value_type ; + * + * 2) An array of POD of a runtime specified count. 
+ * typedef {pod_type} value_type[] ; + * const unsigned value_count ; + */ +template< class FunctorType , class ArgTag > +struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType::value_type */ > +{ + typedef typename Impl::remove_extent< typename FunctorType::value_type >::type value_type ; + + // If not an array then what is the sizeof(value_type) + enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) }; + + typedef value_type * pointer_type ; + + // The reference_type for an array is 'value_type *' + // The reference_type for a single value is 'value_type &' + + typedef typename Impl::if_c< ! StaticValueSize , value_type * + , value_type & >::type reference_type ; + + // Number of values if single value + template< class F > + KOKKOS_FORCEINLINE_FUNCTION static + typename Impl::enable_if< Impl::is_same<F,FunctorType>::value && StaticValueSize , unsigned >::type + value_count( const F & ) { return 1 ; } + + // Number of values if an array, protect via templating because 'f.value_count' + // will only exist when the functor declares the value_type to be an array. + template< class F > + KOKKOS_FORCEINLINE_FUNCTION static + typename Impl::enable_if< Impl::is_same<F,FunctorType>::value && ! StaticValueSize , unsigned >::type + value_count( const F & f ) { return f.value_count ; } + + // Total size of the value + KOKKOS_INLINE_FUNCTION static + unsigned value_size( const FunctorType & f ) { return value_count( f ) * sizeof(value_type) ; } +}; + + +#if defined( KOKKOS_HAVE_CXX11 ) + +template< class FunctorType , class ArgTag > +struct FunctorValueTraits< FunctorType + , ArgTag + , false /* == exists FunctorType::value_type */ + > +{ +private: + + struct VOIDTAG {}; // Allow declaration of non-matching operator() with void argument tag. + struct REJECTTAG {}; // Reject tagged operator() when using non-tagged execution policy. 
+ + typedef typename + Impl::if_c< Impl::is_same< ArgTag , void >::value , VOIDTAG , ArgTag >::type tag_type ; + + //---------------------------------------- + // parallel_for operator without a tag: + + template< class ArgMember > + KOKKOS_INLINE_FUNCTION + static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember ) const ) {} + + template< class ArgMember > + KOKKOS_INLINE_FUNCTION + static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & ) const ) {} + + template< class TagType , class ArgMember > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember ) const ) {} + + template< class TagType , class ArgMember > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & ) const ) {} + + template< class TagType , class ArgMember > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember ) const ) {} + + template< class TagType , class ArgMember > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & ) const ) {} + + //---------------------------------------- + // parallel_for operator with a tag: + + template< class ArgMember > + KOKKOS_INLINE_FUNCTION + static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember ) const ) {} + + template< class ArgMember > + KOKKOS_INLINE_FUNCTION + static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember ) const ) {} + + template< class ArgMember > + KOKKOS_INLINE_FUNCTION + static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & ) const ) {} + + template< class ArgMember > + KOKKOS_INLINE_FUNCTION + static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const 
ArgMember & ) const ) {} + + //---------------------------------------- + // parallel_reduce operator without a tag: + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , T & ) const ) {} + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , T & ) const ) {} + + template< class TagType , class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , T & ) const ) {} + + template< class TagType , class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , T & ) const ) {} + + template< class TagType , class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , T & ) const ) {} + + template< class TagType , class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , T & ) const ) {} + + //---------------------------------------- + // parallel_reduce operator with a tag: + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , T & ) const ) {} + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , T & ) const ) {} + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , T & ) const ) {} + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( tag_type , void 
(FunctorType::*)( const tag_type & , const ArgMember & , T & ) const ) {} + + //---------------------------------------- + // parallel_scan operator without a tag: + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , T & , bool ) const ) {} + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , T & , bool ) const ) {} + + template< class TagType , class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , T & , bool ) const ) {} + + template< class TagType , class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , T & , bool ) const ) {} + + template< class TagType , class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , T & , bool ) const ) {} + + template< class TagType , class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , T & , bool ) const ) {} + + //---------------------------------------- + // parallel_scan operator with a tag: + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , T & , bool ) const ) {} + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , T & , bool ) const ) {} + + template< class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember& , T & , bool ) const ) {} + + template< 
class ArgMember , class T > + KOKKOS_INLINE_FUNCTION + static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember& , T & , bool ) const ) {} + + //---------------------------------------- + + typedef decltype( deduce_reduce_type( tag_type() , & FunctorType::operator() ) ) ValueType ; + + enum { IS_VOID = Impl::is_same<VOIDTAG ,ValueType>::value }; + enum { IS_REJECT = Impl::is_same<REJECTTAG,ValueType>::value }; + +public: + + typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType >::type value_type ; + typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType * >::type pointer_type ; + typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType & >::type reference_type ; + + enum { StaticValueSize = IS_VOID || IS_REJECT ? 0 : sizeof(ValueType) }; + + KOKKOS_FORCEINLINE_FUNCTION static + unsigned value_size( const FunctorType & ) { return StaticValueSize ; } + + KOKKOS_FORCEINLINE_FUNCTION static + unsigned value_count( const FunctorType & ) { return IS_VOID || IS_REJECT ? 
0 : 1 ; } +}; + +#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */ + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// Function signatures for FunctorType::init function with a tag and not an array +template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize > +struct FunctorValueInitFunction { + + typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type & ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type & ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type & ) ); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type volatile & ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type volatile & ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type volatile & ) ); +}; + +// Function signatures for FunctorType::init function with a tag and is an array +template< class FunctorType , class ArgTag > +struct FunctorValueInitFunction< FunctorType , ArgTag , true > { + + typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type * ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) const ); + 
KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type * ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type * ) ); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type volatile * ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type volatile * ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type volatile * ) ); +}; + +// Function signatures for FunctorType::init function without a tag and not an array +template< class FunctorType > +struct FunctorValueInitFunction< FunctorType , void , false > { + + typedef typename FunctorValueTraits<FunctorType,void>::reference_type value_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( value_type & ) ); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type volatile & ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( value_type volatile & ) ); +}; + +// Function signatures for FunctorType::init function without a tag and is an array +template< class FunctorType > +struct FunctorValueInitFunction< FunctorType , void , true > { + + typedef typename FunctorValueTraits<FunctorType,void>::reference_type value_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( value_type * ) ); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type volatile * ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( value_type volatile * ) ); +}; + +// Adapter for value initialization function. 
+// If a proper FunctorType::init is declared then use it, +// otherwise use default constructor. +template< class FunctorType , class ArgTag + , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type + , class Enable = void > +struct FunctorValueInit ; + +/* No 'init' function provided for single value */ +template< class FunctorType , class ArgTag , class T , class Enable > +struct FunctorValueInit< FunctorType , ArgTag , T & , Enable > +{ + KOKKOS_FORCEINLINE_FUNCTION static + T & init( const FunctorType & f , void * p ) + { return *( new(p) T() ); }; +}; + +/* No 'init' function provided for array value */ +template< class FunctorType , class ArgTag , class T , class Enable > +struct FunctorValueInit< FunctorType , ArgTag , T * , Enable > +{ + KOKKOS_FORCEINLINE_FUNCTION static + T * init( const FunctorType & f , void * p ) + { + const int n = FunctorValueTraits< FunctorType , ArgTag >::value_count(f); + for ( int i = 0 ; i < n ; ++i ) { new( ((T*)p) + i ) T(); } + return (T*)p ; + } +}; + +/* 'init' function provided for single value */ +template< class FunctorType , class T > +struct FunctorValueInit + < FunctorType + , void + , T & + // First substitution failure when FunctorType::init does not exist. +#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when FunctorType::init is not compatible. + , decltype( FunctorValueInitFunction< FunctorType , void >::enable_if( & FunctorType::init ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + T & init( const FunctorType & f , void * p ) + { f.init( *((T*)p) ); return *((T*)p) ; } +}; + +/* 'init' function provided for array value */ +template< class FunctorType , class T > +struct FunctorValueInit + < FunctorType + , void + , T * + // First substitution failure when FunctorType::init does not exist. 
+#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when FunctorType::init is not compatible + , decltype( FunctorValueInitFunction< FunctorType , void >::enable_if( & FunctorType::init ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + T * init( const FunctorType & f , void * p ) + { f.init( (T*)p ); return (T*)p ; } +}; + +/* 'init' function provided for single value */ +template< class FunctorType , class ArgTag , class T > +struct FunctorValueInit + < FunctorType + , ArgTag + , T & + // First substitution failure when FunctorType::init does not exist. +#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when FunctorType::init is not compatible. + , decltype( FunctorValueInitFunction< FunctorType , ArgTag >::enable_if( & FunctorType::init ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + T & init( const FunctorType & f , void * p ) + { f.init( ArgTag() , *((T*)p) ); return *((T*)p) ; } +}; + +/* 'init' function provided for array value */ +template< class FunctorType , class ArgTag , class T > +struct FunctorValueInit + < FunctorType + , ArgTag + , T * + // First substitution failure when FunctorType::init does not exist. 
+#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when FunctorType::init is not compatible + , decltype( FunctorValueInitFunction< FunctorType , ArgTag >::enable_if( & FunctorType::init ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + T * init( const FunctorType & f , void * p ) + { f.init( ArgTag() , (T*)p ); return (T*)p ; } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// Signatures for compatible FunctorType::join with tag and not an array +template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize > +struct FunctorValueJoinFunction { + + typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ; + + typedef volatile value_type & vref_type ; + typedef const volatile value_type & cvref_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , vref_type , cvref_type ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , vref_type , cvref_type ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , vref_type , cvref_type ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , vref_type , cvref_type ) ); +}; + +// Signatures for compatible FunctorType::join with tag and is an array +template< class FunctorType , class ArgTag > +struct FunctorValueJoinFunction< FunctorType , ArgTag , true > { + + typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ; + + typedef volatile value_type * vptr_type ; + typedef const volatile value_type * cvptr_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , 
vptr_type , cvptr_type ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , vptr_type , cvptr_type ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , vptr_type , cvptr_type ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , vptr_type , cvptr_type ) ); +}; + +// Signatures for compatible FunctorType::join without tag and not an array +template< class FunctorType > +struct FunctorValueJoinFunction< FunctorType , void , false > { + + typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ; + + typedef volatile value_type & vref_type ; + typedef const volatile value_type & cvref_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( vref_type , cvref_type ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( vref_type , cvref_type ) ); +}; + +// Signatures for compatible FunctorType::join without tag and is an array +template< class FunctorType > +struct FunctorValueJoinFunction< FunctorType , void , true > { + + typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ; + + typedef volatile value_type * vptr_type ; + typedef const volatile value_type * cvptr_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( vptr_type , cvptr_type ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( vptr_type , cvptr_type ) ); +}; + + +template< class FunctorType , class ArgTag + , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type + , class Enable = void > +struct FunctorValueJoin ; + +/* No 'join' function provided, single value */ +template< class FunctorType , class ArgTag , class T , class Enable > +struct FunctorValueJoin< FunctorType , ArgTag , T & , Enable > +{ + KOKKOS_FORCEINLINE_FUNCTION static + void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + { + *((volatile T*)lhs) += 
*((const volatile T*)rhs); + } +}; + +/* No 'join' function provided, array of values */ +template< class FunctorType , class ArgTag , class T , class Enable > +struct FunctorValueJoin< FunctorType , ArgTag , T * , Enable > +{ + KOKKOS_FORCEINLINE_FUNCTION static + void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + { + const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f); + + for ( int i = 0 ; i < n ; ++i ) { ((volatile T*)lhs)[i] += ((const volatile T*)rhs)[i]; } + } +}; + +/* 'join' function provided, single value */ +template< class FunctorType , class ArgTag , class T > +struct FunctorValueJoin + < FunctorType + , ArgTag + , T & + // First substitution failure when FunctorType::join does not exist. +#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when enable_if( & Functor::join ) does not exist + , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + { + f.join( ArgTag() , *((volatile T *)lhs) , *((const volatile T *)rhs) ); + } +}; + +/* 'join' function provided, no tag, single value */ +template< class FunctorType , class T > +struct FunctorValueJoin + < FunctorType + , void + , T & + // First substitution failure when FunctorType::join does not exist. 
+#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when enable_if( & Functor::join ) does not exist + , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + { + f.join( *((volatile T *)lhs) , *((const volatile T *)rhs) ); + } +}; + +/* 'join' function provided for array value */ +template< class FunctorType , class ArgTag , class T > +struct FunctorValueJoin + < FunctorType + , ArgTag + , T * + // First substitution failure when FunctorType::join does not exist. +#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when enable_if( & Functor::join ) does not exist + , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + { + f.join( ArgTag() , (volatile T *)lhs , (const volatile T *)rhs ); + } +}; + +/* 'join' function provided, no tag, array value */ +template< class FunctorType , class T > +struct FunctorValueJoin + < FunctorType + , void + , T * + // First substitution failure when FunctorType::join does not exist. 
+#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when enable_if( & Functor::join ) does not exist + , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs ) + { + f.join( (volatile T *)lhs , (const volatile T *)rhs ); + } +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +namespace Impl { + +#if defined( KOKKOS_HAVE_CXX11 ) + + template<typename ValueType, class JoinOp, class Enable = void> + struct JoinLambdaAdapter { + typedef ValueType value_type; + const JoinOp& lambda; + KOKKOS_INLINE_FUNCTION + JoinLambdaAdapter(const JoinOp& lambda_):lambda(lambda_) {} + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, const volatile value_type& src) const { + lambda(dst,src); + } + + KOKKOS_INLINE_FUNCTION + void join(value_type& dst, const value_type& src) const { + lambda(dst,src); + } + + KOKKOS_INLINE_FUNCTION + void operator() (volatile value_type& dst, const volatile value_type& src) const { + lambda(dst,src); + } + + KOKKOS_INLINE_FUNCTION + void operator() (value_type& dst, const value_type& src) const { + lambda(dst,src); + } + }; + + template<typename ValueType, class JoinOp> + struct JoinLambdaAdapter<ValueType, JoinOp, decltype( FunctorValueJoinFunction< JoinOp , void >::enable_if( & JoinOp::join ) )> { + typedef ValueType value_type; + typedef StaticAssertSame<ValueType,typename JoinOp::value_type> assert_value_types_match; + const JoinOp& lambda; + KOKKOS_INLINE_FUNCTION + JoinLambdaAdapter(const JoinOp& lambda_):lambda(lambda_) {} + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, const volatile value_type& src) const { + lambda.join(dst,src); + } + + KOKKOS_INLINE_FUNCTION + void join(value_type& dst, const value_type& src) 
const { + lambda.join(dst,src); + } + + KOKKOS_INLINE_FUNCTION + void operator() (volatile value_type& dst, const volatile value_type& src) const { + lambda.join(dst,src); + } + + KOKKOS_INLINE_FUNCTION + void operator() (value_type& dst, const value_type& src) const { + lambda.join(dst,src); + } + }; + +#endif + + template<typename ValueType> + struct JoinAdd { + typedef ValueType value_type; + + KOKKOS_INLINE_FUNCTION + JoinAdd() {} + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, const volatile value_type& src) const { + dst+=src; + } + KOKKOS_INLINE_FUNCTION + void operator() (value_type& dst, const value_type& src) const { + dst+=src; + } + KOKKOS_INLINE_FUNCTION + void operator() (volatile value_type& dst, const volatile value_type& src) const { + dst+=src; + } + }; + +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ArgTag + , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type > +struct FunctorValueOps ; + +template< class FunctorType , class ArgTag , class T > +struct FunctorValueOps< FunctorType , ArgTag , T & > +{ + KOKKOS_FORCEINLINE_FUNCTION static + T * pointer( T & r ) { return & r ; } + + KOKKOS_FORCEINLINE_FUNCTION static + T & reference( void * p ) { return *((T*)p); } + + KOKKOS_FORCEINLINE_FUNCTION static + void copy( const FunctorType & , void * const lhs , const void * const rhs ) + { *((T*)lhs) = *((const T*)rhs); } +}; + +/* No 'join' function provided, array of values */ +template< class FunctorType , class ArgTag , class T > +struct FunctorValueOps< FunctorType , ArgTag , T * > +{ + KOKKOS_FORCEINLINE_FUNCTION static + T * pointer( T * p ) { return p ; } + + KOKKOS_FORCEINLINE_FUNCTION static + T * reference( void * p ) { return ((T*)p); } + + KOKKOS_FORCEINLINE_FUNCTION static + void copy( const FunctorType 
& f , void * const lhs , const void * const rhs ) + { + const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f); + for ( int i = 0 ; i < n ; ++i ) { ((T*)lhs)[i] = ((const T*)rhs)[i]; } + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// Compatible functions for 'final' function and value_type not an array +template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize > +struct FunctorFinalFunction { + + typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type & ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type & ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type & ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type & ) ); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type volatile & ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type volatile & ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type volatile & ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( 
*)( ArgTag const & , value_type volatile & ) ); + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type const & ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const & ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type const & ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const & ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type const & ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type const & ) ); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type const volatile & ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile & ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type const volatile & ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile & ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type const volatile & ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type const volatile & ) ); +}; + +// Compatible functions for 'final' function and value_type is an array +template< class FunctorType , class ArgTag > +struct FunctorFinalFunction< FunctorType , ArgTag , true > { + + typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type * ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type * ) ); + 
KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type * ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type * ) ); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type volatile * ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type volatile * ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type volatile * ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type volatile * ) ); + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type const * ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const * ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type const * ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const * ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type const * ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type const * ) ); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type const volatile * ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile * ) const ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , value_type const volatile * ) ); + // KOKKOS_INLINE_FUNCTION 
static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile * ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , value_type const volatile * ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , value_type const volatile * ) ); +}; + +template< class FunctorType > +struct FunctorFinalFunction< FunctorType , void , false > { + + typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( value_type & ) ); + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type & ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type & ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( const value_type & ) ); +}; + +template< class FunctorType > +struct FunctorFinalFunction< FunctorType , void , true > { + + typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ; + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( value_type * ) ); + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type * ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type * ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( const value_type * ) ); +}; + +/* No 'final' function provided */ +template< class FunctorType , class ArgTag + , class ResultType = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type + , class Enable = void > +struct FunctorFinal +{ + 
KOKKOS_FORCEINLINE_FUNCTION static + void final( const FunctorType & , void * ) {} +}; + +/* 'final' function provided */ +template< class FunctorType , class ArgTag , class T > +struct FunctorFinal + < FunctorType + , ArgTag + , T & + // First substitution failure when FunctorType::final does not exist. +#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when enable_if( & Functor::final ) does not exist + , decltype( FunctorFinalFunction< FunctorType , ArgTag >::enable_if( & FunctorType::final ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + void final( const FunctorType & f , void * p ) { f.final( *((T*)p) ); } + + KOKKOS_FORCEINLINE_FUNCTION static + void final( FunctorType & f , void * p ) { f.final( *((T*)p) ); } +}; + +/* 'final' function provided for array value */ +template< class FunctorType , class ArgTag , class T > +struct FunctorFinal + < FunctorType + , ArgTag + , T * + // First substitution failure when FunctorType::final does not exist. 
+#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when enable_if( & Functor::final ) does not exist + , decltype( FunctorFinalFunction< FunctorType , ArgTag >::enable_if( & FunctorType::final ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + void final( const FunctorType & f , void * p ) { f.final( (T*)p ); } + + KOKKOS_FORCEINLINE_FUNCTION static + void final( FunctorType & f , void * p ) { f.final( (T*)p ); } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ArgTag + , class ReferenceType = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type > +struct FunctorApplyFunction { + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , ReferenceType ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , ReferenceType ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag , ReferenceType ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , ReferenceType ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag , ReferenceType ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ArgTag const & , ReferenceType ) ); +}; + +template< class FunctorType , class ReferenceType > +struct FunctorApplyFunction< FunctorType , void , ReferenceType > { + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ReferenceType ) const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ReferenceType ) ); + KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( ReferenceType ) ); +}; + +template< class FunctorType > +struct 
FunctorApplyFunction< FunctorType , void , void > { + + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)() const ); + KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)() ); +}; + +template< class FunctorType , class ArgTag , class ReferenceType + , class Enable = void > +struct FunctorApply +{ + KOKKOS_FORCEINLINE_FUNCTION static + void apply( const FunctorType & , void * ) {} +}; + +/* 'apply' function provided for void value */ +template< class FunctorType , class ArgTag > +struct FunctorApply + < FunctorType + , ArgTag + , void + // First substitution failure when FunctorType::apply does not exist. +#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when enable_if( & Functor::apply ) does not exist + , decltype( FunctorApplyFunction< FunctorType , ArgTag , void >::enable_if( & FunctorType::apply ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::apply ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + void apply( FunctorType & f ) { f.apply(); } + + KOKKOS_FORCEINLINE_FUNCTION static + void apply( const FunctorType & f ) { f.apply(); } +}; + +/* 'apply' function provided for single value */ +template< class FunctorType , class ArgTag , class T > +struct FunctorApply + < FunctorType + , ArgTag + , T & + // First substitution failure when FunctorType::apply does not exist. 
+#if defined( KOKKOS_HAVE_CXX11 ) + // Second substitution failure when enable_if( & Functor::apply ) does not exist + , decltype( FunctorApplyFunction< FunctorType , ArgTag >::enable_if( & FunctorType::apply ) ) +#else + , typename Impl::enable_if< 0 < sizeof( & FunctorType::apply ) >::type +#endif + > +{ + KOKKOS_FORCEINLINE_FUNCTION static + void apply( const FunctorType & f , void * p ) { f.apply( *((T*)p) ); } + + KOKKOS_FORCEINLINE_FUNCTION static + void apply( FunctorType & f , void * p ) { f.apply( *((T*)p) ); } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* KOKKOS_FUNCTORADAPTER_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp new file mode 100755 index 0000000000000000000000000000000000000000..5c6a5b03b1ca07d6d1b6ba73f07e05a74f71f675 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -0,0 +1,455 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#include <Kokkos_Macros.hpp> + +/*--------------------------------------------------------------------------*/ + +#if defined( __INTEL_COMPILER ) && ! 
defined ( KOKKOS_HAVE_CUDA ) + +// Intel specialized allocator does not interoperate with CUDA memory allocation + +#define KOKKOS_INTEL_MM_ALLOC_AVAILABLE + +#endif + +/*--------------------------------------------------------------------------*/ + +#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \ + ( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 ) + +#define KOKKOS_POSIX_MEMALIGN_AVAILABLE + +#include <unistd.h> +#include <sys/mman.h> + +/* mmap flags for private anonymous memory allocation */ + +#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE ) + #define KOKKOS_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS) +#elif defined( MAP_ANON ) && defined( MAP_PRIVATE ) + #define KOKKOS_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANON) +#endif + +// mmap flags for huge page tables +#if defined( KOKKOS_POSIX_MMAP_FLAGS ) + #if defined( MAP_HUGETLB ) + #define KOKKOS_POSIX_MMAP_FLAGS_HUGE (KOKKOS_POSIX_MMAP_FLAGS | MAP_HUGETLB ) + #else + #define KOKKOS_POSIX_MMAP_FLAGS_HUGE KOKKOS_POSIX_MMAP_FLAGS + #endif +#endif + +#endif + +/*--------------------------------------------------------------------------*/ + +#include <stddef.h> +#include <stdlib.h> +#include <stdint.h> +#include <memory.h> + +#include <iostream> +#include <sstream> +#include <cstring> + +#include <Kokkos_HostSpace.hpp> +#include <impl/Kokkos_BasicAllocators.hpp> +#include <impl/Kokkos_Error.hpp> +#include <Kokkos_Atomic.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + + +DeepCopy<HostSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n ) +{ + memcpy( dst , src , n ); +} + +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace { + +static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ; + +typedef int (* QuerySpaceInParallelPtr )(); + 
+QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ; +int s_in_parallel_query_count = 0 ; + +} // namespace <empty> + +void HostSpace::register_in_parallel( int (*device_in_parallel)() ) +{ + if ( 0 == device_in_parallel ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) ); + } + + int i = -1 ; + + if ( ! (device_in_parallel)() ) { + for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i ); + } + + if ( i < s_in_parallel_query_count ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) ); + + } + + if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) ); + + } + + for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i ); + + if ( i == s_in_parallel_query_count ) { + s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ; + } +} + +int HostSpace::in_parallel() +{ + const int n = s_in_parallel_query_count ; + + int i = 0 ; + + while ( i < n && ! 
(*(s_in_parallel_query[i]))() ) { ++i ; } + + return i < n ; +} + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +Impl::AllocationTracker HostSpace::allocate_and_track( const std::string & label, const size_t size ) +{ + return Impl::AllocationTracker( allocator(), size, label ); +} + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/* Default allocation mechanism */ +HostSpace::HostSpace() + : m_alloc_mech( +#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE ) + HostSpace::INTEL_MM_ALLOC +#elif defined( KOKKOS_POSIX_MMAP_FLAGS ) + HostSpace::POSIX_MMAP +#elif defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE ) + HostSpace::POSIX_MEMALIGN +#else + HostSpace::STD_MALLOC +#endif + ) +{} + +/* Default allocation mechanism */ +HostSpace::HostSpace( const HostSpace::AllocationMechanism & arg_alloc_mech ) + : m_alloc_mech( HostSpace::STD_MALLOC ) +{ + if ( arg_alloc_mech == STD_MALLOC ) { + m_alloc_mech = HostSpace::STD_MALLOC ; + } +#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE ) + else if ( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) { + m_alloc_mech = HostSpace::INTEL_MM_ALLOC ; + } +#elif defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE ) + else if ( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) { + m_alloc_mech = HostSpace::POSIX_MEMALIGN ; + } +#elif defined( KOKKOS_POSIX_MMAP_FLAGS ) + else if ( arg_alloc_mech == HostSpace::POSIX_MMAP ) { + m_alloc_mech = HostSpace::POSIX_MMAP ; + } +#endif + else { + const char * const mech = + ( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) ? "INTEL_MM_ALLOC" : ( + ( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) ? "POSIX_MEMALIGN" : ( + ( arg_alloc_mech == HostSpace::POSIX_MMAP ) ? 
"POSIX_MMAP" : "" )); + + std::string msg ; + msg.append("Kokkos::HostSpace "); + msg.append(mech); + msg.append(" is not available" ); + Kokkos::Impl::throw_runtime_exception( msg ); + } +} + +void * HostSpace::allocate( const size_t arg_alloc_size ) const +{ + static_assert( sizeof(void*) == sizeof(uintptr_t) + , "Error sizeof(void*) != sizeof(uintptr_t)" ); + + static_assert( Kokkos::Impl::power_of_two< Kokkos::Impl::MEMORY_ALIGNMENT >::value + , "Memory alignment must be power of two" ); + + constexpr size_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT ; + constexpr size_t alignment_mask = alignment - 1 ; + + void * ptr = NULL; + + if ( arg_alloc_size ) { + + if ( m_alloc_mech == STD_MALLOC ) { + // Over-allocate to and round up to guarantee proper alignment. + size_t size_padded = arg_alloc_size + sizeof(void*) + alignment ; + + void * alloc_ptr = malloc( size_padded ); + + if (alloc_ptr) { + uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr); + + // offset enough to record the alloc_ptr + address += sizeof(void *); + uintptr_t rem = address % alignment; + uintptr_t offset = rem ? (alignment - rem) : 0u; + address += offset; + ptr = reinterpret_cast<void *>(address); + // record the alloc'd pointer + address -= sizeof(void *); + *reinterpret_cast<void **>(address) = alloc_ptr; + } + } + +#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE ) + else if ( m_alloc_mech == INTEL_MM_ALLOC ) { + ptr = _mm_malloc( arg_alloc_size , alignment ); + } +#endif + +#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE ) + else if ( m_alloc_mech == POSIX_MEMALIGN ) { + posix_memalign( & ptr, alignment , arg_alloc_size ); + } +#endif + +#if defined( KOKKOS_POSIX_MMAP_FLAGS ) + else if ( m_alloc_mech == POSIX_MMAP ) { + constexpr size_t use_huge_pages = (1u << 27); + constexpr int prot = PROT_READ | PROT_WRITE ; + const int flags = arg_alloc_size < use_huge_pages + ? 
KOKKOS_POSIX_MMAP_FLAGS + : KOKKOS_POSIX_MMAP_FLAGS_HUGE ; + + // read write access to private memory + + ptr = mmap( NULL /* address hint, if NULL OS kernel chooses address */ + , arg_alloc_size /* size in bytes */ + , prot /* memory protection */ + , flags /* visibility of updates */ + , -1 /* file descriptor */ + , 0 /* offset */ + ); + +/* Associated reallocation: + ptr = mremap( old_ptr , old_size , new_size , MREMAP_MAYMOVE ); +*/ + } +#endif + } + + if ( reinterpret_cast<uintptr_t>(ptr) & alignment_mask ) { + Kokkos::Impl::throw_runtime_exception( "Kokkos::HostSpace aligned allocation failed" ); + } + + return ptr; +} + + +void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const +{ + if ( arg_alloc_ptr ) { + + if ( m_alloc_mech == STD_MALLOC ) { + void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1); + free( alloc_ptr ); + } + +#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE ) + else if ( m_alloc_mech == INTEL_MM_ALLOC ) { + _mm_free( arg_alloc_ptr ); + } +#endif + +#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE ) + else if ( m_alloc_mech == POSIX_MEMALIGN ) { + free( arg_alloc_ptr ); + } +#endif + +#if defined( KOKKOS_POSIX_MMAP_FLAGS ) + else if ( m_alloc_mech == POSIX_MMAP ) { + munmap( arg_alloc_ptr , arg_alloc_size ); + } +#endif + + } +} + +} // namespace Kokkos + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +SharedAllocationRecord< void , void > +SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record ; + +void +SharedAllocationRecord< Kokkos::HostSpace , void >:: +deallocate( SharedAllocationRecord< void , void > * arg_rec ) +{ + delete static_cast<SharedAllocationRecord*>(arg_rec); +} + +SharedAllocationRecord< Kokkos::HostSpace , void >:: +~SharedAllocationRecord() +{ + m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr + , SharedAllocationRecord< void , void >::m_alloc_size + ); +} + +SharedAllocationRecord< Kokkos::HostSpace , void >:: 
+SharedAllocationRecord( const Kokkos::HostSpace & arg_space + , const std::string & arg_label + , const size_t arg_alloc_size + , const SharedAllocationRecord< void , void >::function_type arg_dealloc + ) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : SharedAllocationRecord< void , void > + ( & SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record + , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) ) + , sizeof(SharedAllocationHeader) + arg_alloc_size + , arg_dealloc + ) + , m_space( arg_space ) +{ + // Fill in the Header information + RecordBase::m_alloc_ptr->m_record = static_cast< SharedAllocationRecord< void , void > * >( this ); + + strncpy( RecordBase::m_alloc_ptr->m_label + , arg_label.c_str() + , SharedAllocationHeader::maximum_label_length + ); +} + +SharedAllocationRecord< Kokkos::HostSpace , void > * +SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr ) +{ + typedef SharedAllocationHeader Header ; + typedef SharedAllocationRecord< Kokkos::HostSpace , void > RecordHost ; + + SharedAllocationHeader const * const head = Header::get_header( alloc_ptr ); + RecordHost * const record = static_cast< RecordHost * >( head->m_record ); + + if ( record->m_alloc_ptr != head ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void >::get_record ERROR" ) ); + } + + return record ; +} + +// Iterate records to print orphaned memory ... 
+void SharedAllocationRecord< Kokkos::HostSpace , void >:: +print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail ) +{ + SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail ); +} + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace { + const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF; + const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39; + static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1]; +} + +namespace Impl { +void init_lock_array_host_space() { + static int is_initialized = 0; + if(! is_initialized) + for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++) + HOST_SPACE_ATOMIC_LOCKS[i] = 0; +} + +bool lock_address_host_space(void* ptr) { + return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[ + (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] , + 0 , 1); +} + +void unlock_address_host_space(void* ptr) { + atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[ + (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] , + 0); +} + +} +} diff --git a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp new file mode 100755 index 0000000000000000000000000000000000000000..17eb0c2f4b4d25f3f738e97465b24c34c39de22d --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp @@ -0,0 +1,73 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_MEMORY_FENCE ) +#define KOKKOS_MEMORY_FENCE +namespace Kokkos { + +//---------------------------------------------------------------------------- + +KOKKOS_FORCEINLINE_FUNCTION +void memory_fence() +{ +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + __threadfence(); +#elif defined( KOKKOS_ATOMICS_USE_GCC ) || \ + ( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ATOMICS_USE_INTEL ) ) + __sync_synchronize(); +#elif defined( KOKKOS_ATOMICS_USE_INTEL ) + _mm_mfence(); +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + #pragma omp flush +#elif defined( KOKKOS_ATOMICS_USE_WINDOWS ) + MemoryBarrier(); +#else + #error "Error: memory_fence() not defined" +#endif +} + +} // namespace kokkos + +#endif + + diff --git a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp new file mode 100755 index 0000000000000000000000000000000000000000..0e87c63e4469e93496074a73f92a98b27b642c61 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp @@ -0,0 +1,84 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP +#define KOKKOS_PHYSICAL_LAYOUT_HPP + + +#include <Kokkos_View.hpp> +namespace Kokkos { +namespace Impl { + + + +struct PhysicalLayout { + enum LayoutType {Left,Right,Scalar,Error}; + LayoutType layout_type; + int rank; + long long int stride[8]; //distance between two neighboring elements in a given dimension + + template< class T , class L , class D , class M > + PhysicalLayout( const View<T,L,D,M,ViewDefault> & view ) + : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value ? Left : ( + is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? 
Right : Error )) + , rank( view.Rank ) + { + for(int i=0;i<8;i++) stride[i] = 0; + view.stride( stride ); + } + #ifdef KOKKOS_HAVE_CUDA + template< class T , class L , class D , class M > + PhysicalLayout( const View<T,L,D,M,ViewCudaTexture> & view ) + : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value ? Left : ( + is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error )) + , rank( view.Rank ) + { + for(int i=0;i<8;i++) stride[i] = 0; + view.stride( stride ); + } + #endif +}; + +} +} +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp new file mode 100755 index 0000000000000000000000000000000000000000..5da60841d4376e45baf5c0733cb23c8449278ba3 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp @@ -0,0 +1,57 @@ +/* + //@HEADER + // ************************************************************************ + // + // Kokkos v. 2.0 + // Copyright (2014) Sandia Corporation + // + // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, + // the U.S. Government retains certain rights in this software. + // + // Redistribution and use in source and binary forms, with or without + // modification, are permitted provided that the following conditions are + // met: + // + // 1. Redistributions of source code must retain the above copyright + // notice, this list of conditions and the following disclaimer. + // + // 2. Redistributions in binary form must reproduce the above copyright + // notice, this list of conditions and the following disclaimer in the + // documentation and/or other materials provided with the distribution. + // + // 3. Neither the name of the Corporation nor the names of the + // contributors may be used to endorse or promote products derived from + // this software without specific prior written permission. 
+ // + // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY + // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE + // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // + // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) + // + // ************************************************************************ + //@HEADER +*/ + +#ifndef KOKKOSP_DEVICE_INFO_HPP +#define KOKKOSP_DEVICE_INFO_HPP + +namespace Kokkos { +namespace Experimental { + + struct KokkosPDeviceInfo { + uint32_t deviceID; + }; + +} +} + +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp new file mode 100755 index 0000000000000000000000000000000000000000..85ec1709c61ca9c7e4020ea01b1da6e06df6a836 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp @@ -0,0 +1,141 @@ +/* + //@HEADER + // ************************************************************************ + // + // Kokkos v. 2.0 + // Copyright (2014) Sandia Corporation + // + // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, + // the U.S. Government retains certain rights in this software. + // + // Redistribution and use in source and binary forms, with or without + // modification, are permitted provided that the following conditions are + // met: + // + // 1. 
Redistributions of source code must retain the above copyright + // notice, this list of conditions and the following disclaimer. + // + // 2. Redistributions in binary form must reproduce the above copyright + // notice, this list of conditions and the following disclaimer in the + // documentation and/or other materials provided with the distribution. + // + // 3. Neither the name of the Corporation nor the names of the + // contributors may be used to endorse or promote products derived from + // this software without specific prior written permission. + // + // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY + // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE + // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // + // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) + // + // ************************************************************************ + //@HEADER + */ + +#include <impl/Kokkos_Profiling_Interface.hpp> + +#ifdef KOKKOSP_ENABLE_PROFILING +#include <string.h> + +namespace Kokkos { + namespace Experimental { + bool profileLibraryLoaded() { + return (NULL != initProfileLibrary); + } + + void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { + if(NULL != beginForCallee) { + Kokkos::fence(); + (*beginForCallee)(kernelPrefix.c_str(), devID, kernelID); + } + }; + + void endParallelFor(const uint64_t kernelID) { + if(NULL != endForCallee) { + Kokkos::fence(); + (*endForCallee)(kernelID); + } + }; + + void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { + if(NULL != beginScanCallee) { + Kokkos::fence(); + (*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID); + } + }; + + void endParallelScan(const uint64_t kernelID) { + if(NULL != endScanCallee) { + Kokkos::fence(); + (*endScanCallee)(kernelID); + } + }; + + void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { + if(NULL != beginReduceCallee) { + Kokkos::fence(); + (*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID); + } + }; + + void endParallelReduce(const uint64_t kernelID) { + if(NULL != endReduceCallee) { + Kokkos::fence(); + (*endReduceCallee)(kernelID); + } + }; + + void initialize() { + void* firstProfileLibrary; + + char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY"); + char* profileLibraryName = strtok(envProfileLibrary, ";"); + + if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) { + firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL); + + if(NULL == firstProfileLibrary) { + std::cerr << "Error: Unable to load KokkosP library: " << + profileLibraryName << std::endl; + } else { + std::cout << "KOKKOSP: Library Loaded: " << 
profileLibraryName << std::endl; + + beginForCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for"); + beginScanCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan"); + beginReduceCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce"); + + endScanCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan"); + endForCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_for"); + endReduceCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce"); + + initProfileLibrary = (initFunction) dlsym(firstProfileLibrary, "kokkosp_init_library"); + finalizeProfileLibrary = (finalizeFunction) dlsym(firstProfileLibrary, "kokkosp_finalize_library"); + } + } + + if(NULL != initProfileLibrary) { + (*initProfileLibrary)(0, + (uint64_t) KOKKOSP_INTERFACE_VERSION, + (uint32_t) 0, + NULL); + } + }; + + void finalize() { + if(NULL != finalizeProfileLibrary) { + (*finalizeProfileLibrary)(); + } + }; + } +} + +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp new file mode 100755 index 0000000000000000000000000000000000000000..1e2f715f36d9d6275c42ada1bc11291a1c18b628 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -0,0 +1,98 @@ +/* + //@HEADER + // ************************************************************************ + // + // Kokkos v. 2.0 + // Copyright (2014) Sandia Corporation + // + // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, + // the U.S. Government retains certain rights in this software. + // + // Redistribution and use in source and binary forms, with or without + // modification, are permitted provided that the following conditions are + // met: + // + // 1. Redistributions of source code must retain the above copyright + // notice, this list of conditions and the following disclaimer. 
+ // + // 2. Redistributions in binary form must reproduce the above copyright + // notice, this list of conditions and the following disclaimer in the + // documentation and/or other materials provided with the distribution. + // + // 3. Neither the name of the Corporation nor the names of the + // contributors may be used to endorse or promote products derived from + // this software without specific prior written permission. + // + // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY + // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE + // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // + // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) + // + // ************************************************************************ + //@HEADER + */ + +#ifndef KOKKOSP_INTERFACE_HPP +#define KOKKOSP_INTERFACE_HPP + +#include <cstddef> +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Macros.hpp> +#include <string> + +#ifdef KOKKOSP_ENABLE_PROFILING +#include <impl/Kokkos_Profiling_DeviceInfo.hpp> +#include <dlfcn.h> +#include <iostream> +#include <stdlib.h> +#endif + +#define KOKKOSP_INTERFACE_VERSION 20150628 + +#ifdef KOKKOSP_ENABLE_PROFILING +namespace Kokkos { + namespace Experimental { + + typedef void (*initFunction)(const int, + const uint64_t, + const uint32_t, + KokkosPDeviceInfo*); + typedef void (*finalizeFunction)(); + typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*); + typedef void (*endFunction)(uint64_t); + + static initFunction initProfileLibrary = NULL; + static finalizeFunction finalizeProfileLibrary = NULL; + static beginFunction beginForCallee = NULL; + static beginFunction beginScanCallee = NULL; + static beginFunction beginReduceCallee = NULL; + static endFunction endForCallee = NULL; + static endFunction endScanCallee = NULL; + static endFunction endReduceCallee = NULL; + + bool profileLibraryLoaded(); + + void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID); + void endParallelFor(const uint64_t kernelID); + void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID); + void endParallelScan(const uint64_t kernelID); + void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID); + void endParallelReduce(const uint64_t kernelID); + + void initialize(); + void finalize(); + + } +} + +#endif +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp new file mode 100755 index 0000000000000000000000000000000000000000..562c7afc6de5e3b6913671e52abc5157dc61c6d5 --- 
/dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp @@ -0,0 +1,119 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdlib.h> +#include <sstream> +#include <Kokkos_Serial.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Error.hpp> + +#if defined( KOKKOS_HAVE_SERIAL ) + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +namespace SerialImpl { + +Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {} + +Sentinel::~Sentinel() +{ + if ( m_scratch ) { free( m_scratch ); } + m_scratch = 0 ; + m_reduce_end = 0 ; + m_shared_end = 0 ; +} + +Sentinel & Sentinel::singleton() +{ + static Sentinel s ; return s ; +} + +inline +unsigned align( unsigned n ) +{ + enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 }; + return ( n + MASK ) & ~MASK ; +} + +} // namespace + +SerialTeamMember::SerialTeamMember( int arg_league_rank + , int arg_league_size + , int arg_shared_size + ) + : m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end + , arg_shared_size ) + , m_league_rank( arg_league_rank ) + , m_league_size( arg_league_size ) +{} + +} // namespace Impl + +void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size ) +{ + static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton(); + + reduce_size = Impl::SerialImpl::align( reduce_size ); + shared_size = Impl::SerialImpl::align( shared_size ); + + if ( ( s.m_reduce_end < reduce_size ) || + ( s.m_shared_end < s.m_reduce_end + shared_size ) ) { + + if ( s.m_scratch ) { free( s.m_scratch ); } + + if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ; + if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ; + + s.m_scratch = malloc( s.m_shared_end ); + } + + return s.m_scratch ; +} + +} // namespace Kokkos + +#endif // defined( KOKKOS_HAVE_SERIAL ) + 
+ diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp new file mode 100755 index 0000000000000000000000000000000000000000..688f97f42e2f9cc41e4a1353a58a277edb49c905 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp @@ -0,0 +1,336 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#include <impl/Kokkos_Serial_TaskPolicy.hpp> + +#if defined( KOKKOS_HAVE_SERIAL ) +#include <stdlib.h> +#include <stdexcept> +#include <iostream> +#include <sstream> +#include <string> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +TaskPolicy< Kokkos::Serial >::member_type & +TaskPolicy< Kokkos::Serial >::member_single() +{ + static member_type s(0,1,0); + return s ; +} + +} // namespace Experimental +} // namespace Kokkos + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +typedef TaskMember< Kokkos::Serial , void , void > Task ; + +//---------------------------------------------------------------------------- + +namespace { + +inline +unsigned padded_sizeof_derived( unsigned sizeof_derived ) +{ + return sizeof_derived + + ( sizeof_derived % sizeof(Task*) ? 
sizeof(Task*) - sizeof_derived % sizeof(Task*) : 0 ); +} + +} // namespace + +void Task::deallocate( void * ptr ) +{ + free( ptr ); +} + +void * Task::allocate( const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity ) +{ + return malloc( padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) ); +} + +Task::~TaskMember() +{ + +} + +Task::TaskMember( const Task::function_verify_type arg_verify + , const Task::function_dealloc_type arg_dealloc + , const Task::function_apply_type arg_apply + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + ) + : m_dealloc( arg_dealloc ) + , m_verify( arg_verify ) + , m_apply( arg_apply ) + , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) ) + , m_wait( 0 ) + , m_next( 0 ) + , m_dep_capacity( arg_dependence_capacity ) + , m_dep_size( 0 ) + , m_ref_count( 0 ) + , m_state( TASK_STATE_CONSTRUCTING ) +{ + for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ; +} + +Task::TaskMember( const Task::function_dealloc_type arg_dealloc + , const Task::function_apply_type arg_apply + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + ) + : m_dealloc( arg_dealloc ) + , m_verify( & Task::verify_type<void> ) + , m_apply( arg_apply ) + , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) ) + , m_wait( 0 ) + , m_next( 0 ) + , m_dep_capacity( arg_dependence_capacity ) + , m_dep_size( 0 ) + , m_ref_count( 0 ) + , m_state( TASK_STATE_CONSTRUCTING ) +{ + for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ; +} + +//---------------------------------------------------------------------------- + +void Task::throw_error_add_dependence() const +{ + std::cerr << "TaskMember< Serial >::add_dependence ERROR" + << " state(" << m_state << ")" + << " dep_size(" << m_dep_size << ")" + << std::endl ; + throw 
std::runtime_error("TaskMember< Serial >::add_dependence ERROR"); +} + +void Task::throw_error_verify_type() +{ + throw std::runtime_error("TaskMember< Serial >::verify_type ERROR"); +} + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + +void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw ) +{ + static const char msg_error_header[] = "Kokkos::Experimental::Impl::TaskManager<Kokkos::Serial>::assign ERROR" ; + static const char msg_error_count[] = ": negative reference count" ; + static const char msg_error_complete[] = ": destroy task that is not complete" ; + static const char msg_error_dependences[] = ": destroy task that has dependences" ; + static const char msg_error_exception[] = ": caught internal exception" ; + + const char * msg_error = 0 ; + + try { + + if ( *lhs ) { + + const int count = --((**lhs).m_ref_count); + + if ( 0 == count ) { + + // Reference count at zero, delete it + + // Should only be deallocating a completed task + if ( (**lhs).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) { + + // A completed task should not have dependences... + for ( int i = 0 ; i < (**lhs).m_dep_size && 0 == msg_error ; ++i ) { + if ( (**lhs).m_dep[i] ) msg_error = msg_error_dependences ; + } + } + else { + msg_error = msg_error_complete ; + } + + if ( 0 == msg_error ) { + // Get deletion function and apply it + const Task::function_dealloc_type d = (**lhs).m_dealloc ; + + (*d)( *lhs ); + } + } + else if ( count <= 0 ) { + msg_error = msg_error_count ; + } + } + + if ( 0 == msg_error && rhs ) { ++( rhs->m_ref_count ); } + + *lhs = rhs ; + } + catch( ... 
) { + if ( 0 == msg_error ) msg_error = msg_error_exception ; + } + + if ( 0 != msg_error ) { + if ( no_throw ) { + std::cerr << msg_error_header << msg_error << std::endl ; + std::cerr.flush(); + } + else { + std::string msg(msg_error_header); + msg.append(msg_error); + throw std::runtime_error( msg ); + } + } +} +#endif + +namespace { + +Task * s_ready = 0 ; +Task * s_denied = reinterpret_cast<Task*>( ~((unsigned long)0) ); + +} + +void Task::schedule() +{ + // Execute ready tasks in case the task being scheduled + // is dependent upon a waiting and ready task. + + Task::execute_ready_tasks(); + + // spawning : Constructing -> Waiting + // respawning : Executing -> Waiting + // updating : Waiting -> Waiting + + // Must not be in a dependence linked list: 0 == t->m_next + + const bool ok_state = TASK_STATE_COMPLETE != m_state ; + const bool ok_list = 0 == m_next ; + + if ( ok_state && ok_list ) { + + // Will be waiting for execution upon return from this function + + m_state = Kokkos::Experimental::TASK_STATE_WAITING ; + + // Insert this task into another dependence that is not complete + + int i = 0 ; + for ( ; i < m_dep_size ; ++i ) { + Task * const y = m_dep[i] ; + if ( y && s_denied != ( m_next = y->m_wait ) ) { + y->m_wait = this ; // CAS( & y->m_wait , m_next , this ); + break ; + } + } + if ( i == m_dep_size ) { + // All dependences are complete, insert into the ready list + m_next = s_ready ; + s_ready = this ; // CAS( & s_ready , m_next = s_ready , this ); + } + } + else { + throw std::runtime_error(std::string("Kokkos::Experimental::Impl::Task spawn or respawn state error")); + } +} + +void Task::execute_ready_tasks() +{ + while ( s_ready ) { + + // Remove this task from the ready list + + // Task * task ; + // while ( ! 
CAS( & s_ready , task = s_ready , s_ready->m_next ) ); + + Task * const task = s_ready ; + s_ready = task->m_next ; + + task->m_next = 0 ; + + // precondition: task->m_state = TASK_STATE_WAITING + // precondition: task->m_dep[i]->m_state == TASK_STATE_COMPLETE for all i + // precondition: does not exist T such that T->m_wait = task + // precondition: does not exist T such that T->m_next = task + + task->m_state = Kokkos::Experimental::TASK_STATE_EXECUTING ; + + (*task->m_apply)( task ); + + if ( task->m_state == Kokkos::Experimental::TASK_STATE_EXECUTING ) { + // task did not respawn itself + task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ; + + // release dependences: + for ( int i = 0 ; i < task->m_dep_size ; ++i ) { + assign( task->m_dep + i , 0 ); + } + + // Stop other tasks from adding themselves to 'task->m_wait' ; + + Task * x ; + // CAS( & task->m_wait , x = task->m_wait , s_denied ); + x = task->m_wait ; task->m_wait = s_denied ; + + // update tasks waiting on this task + while ( x ) { + Task * const next = x->m_next ; + + x->m_next = 0 ; + + x->schedule(); // could happen concurrently + + x = next ; + } + } + } +} + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +#endif // defined( KOKKOS_HAVE_SERIAL ) diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp new file mode 100755 index 0000000000000000000000000000000000000000..4eec2f66bed30d1286bd97298625e51772781195 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp @@ -0,0 +1,845 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_SERIAL_TASKPOLICY_HPP +#define KOKKOS_SERIAL_TASKPOLICY_HPP + +#include <Kokkos_Macros.hpp> +#if defined( KOKKOS_HAVE_SERIAL ) + +#include <string> +#include <typeinfo> +#include <stdexcept> + +#include <Kokkos_Serial.hpp> +#include <Kokkos_TaskPolicy.hpp> +#include <Kokkos_View.hpp> + +#include <impl/Kokkos_FunctorAdapter.hpp> + +//---------------------------------------------------------------------------- +/* Inheritance structure to allow static_cast from the task root type + * and a task's FunctorType. + * + * task_root_type == TaskMember< Space , void , void > + * + * TaskMember< PolicyType , ResultType , FunctorType > + * : TaskMember< PolicyType::Space , ResultType , FunctorType > + * { ... }; + * + * TaskMember< Space , ResultType , FunctorType > + * : TaskMember< Space , ResultType , void > + * , FunctorType + * { ... }; + * + * when ResultType != void + * + * TaskMember< Space , ResultType , void > + * : TaskMember< Space , void , void > + * { ... 
}; + * + */ +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +/** \brief Base class for all tasks in the Serial execution space */ +template<> +class TaskMember< Kokkos::Serial , void , void > +{ +public: + + typedef void (* function_apply_type) ( TaskMember * ); + typedef void (* function_dealloc_type)( TaskMember * ); + typedef TaskMember * (* function_verify_type) ( TaskMember * ); + +private: + + const function_dealloc_type m_dealloc ; ///< Deallocation + const function_verify_type m_verify ; ///< Result type verification + const function_apply_type m_apply ; ///< Apply function + TaskMember ** const m_dep ; ///< Dependences + TaskMember * m_wait ; ///< Linked list of tasks waiting on this task + TaskMember * m_next ; ///< Linked list of tasks waiting on a different task + const int m_dep_capacity ; ///< Capacity of dependences + int m_dep_size ; ///< Actual count of dependences + int m_ref_count ; ///< Reference count + int m_state ; ///< State of the task + + // size = 6 Pointers + 4 ints + + TaskMember() /* = delete */ ; + TaskMember( const TaskMember & ) /* = delete */ ; + TaskMember & operator = ( const TaskMember & ) /* = delete */ ; + + static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity ); + static void deallocate( void * ); + + void throw_error_add_dependence() const ; + static void throw_error_verify_type(); + + template < class DerivedTaskType > + static + void deallocate( TaskMember * t ) + { + DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t); + ptr->~DerivedTaskType(); + deallocate( (void *) ptr ); + } + +protected : + + ~TaskMember(); + + // Used by TaskMember< Serial , ResultType , void > + TaskMember( const function_verify_type arg_verify + , const function_dealloc_type arg_dealloc + , const function_apply_type arg_apply + , const unsigned arg_sizeof_derived + , const unsigned 
arg_dependence_capacity + ); + + // Used for TaskMember< Serial , void , void > + TaskMember( const function_dealloc_type arg_dealloc + , const function_apply_type arg_apply + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + ); + +public: + + template< typename ResultType > + KOKKOS_FUNCTION static + TaskMember * verify_type( TaskMember * t ) + { + enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value }; + + if ( check_type && t != 0 ) { + + // Verify that t->m_verify is this function + const function_verify_type self = & TaskMember::template verify_type< ResultType > ; + + if ( t->m_verify != self ) { + t = 0 ; +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + throw_error_verify_type(); +#endif + } + } + return t ; + } + + //---------------------------------------- + /* Inheritence Requirements on task types: + * typedef FunctorType::value_type value_type ; + * class DerivedTaskType + * : public TaskMember< Serial , value_type , FunctorType > + * { ... }; + * class TaskMember< Serial , value_type , FunctorType > + * : public TaskMember< Serial , value_type , void > + * , public Functor + * { ... 
}; + * If value_type != void + * class TaskMember< Serial , value_type , void > + * : public TaskMember< Serial , void , void > + * + * Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ] + * + */ + + /** \brief Allocate and construct a single-thread task */ + template< class DerivedTaskType > + static + TaskMember * create( const typename DerivedTaskType::functor_type & arg_functor + , const unsigned arg_dependence_capacity + ) + { + typedef typename DerivedTaskType::functor_type functor_type ; + typedef typename functor_type::value_type value_type ; + + DerivedTaskType * const task = + new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) ) + DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType > + , & TaskMember::template apply_single< functor_type , value_type > + , sizeof(DerivedTaskType) + , arg_dependence_capacity + , arg_functor ); + + return static_cast< TaskMember * >( task ); + } + + /** \brief Allocate and construct a data parallel task */ + template< class DerivedTaskType > + static + TaskMember * create( const typename DerivedTaskType::policy_type & arg_policy + , const typename DerivedTaskType::functor_type & arg_functor + , const unsigned arg_dependence_capacity + ) + { + DerivedTaskType * const task = + new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) ) + DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType > + , sizeof(DerivedTaskType) + , arg_dependence_capacity + , arg_policy + , arg_functor + ); + + return static_cast< TaskMember * >( task ); + } + + /** \brief Allocate and construct a thread-team task */ + template< class DerivedTaskType > + static + TaskMember * create_team( const typename DerivedTaskType::functor_type & arg_functor + , const unsigned arg_dependence_capacity + ) + { + typedef typename DerivedTaskType::functor_type functor_type ; + typedef typename functor_type::value_type value_type ; + + DerivedTaskType * const task = + new( allocate( 
sizeof(DerivedTaskType) , arg_dependence_capacity ) ) + DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType > + , & TaskMember::template apply_team< functor_type , value_type > + , sizeof(DerivedTaskType) + , arg_dependence_capacity + , arg_functor ); + + return static_cast< TaskMember * >( task ); + } + + void schedule(); + static void execute_ready_tasks(); + + //---------------------------------------- + + typedef FutureValueTypeIsVoidError get_result_type ; + + KOKKOS_INLINE_FUNCTION + get_result_type get() const { return get_result_type() ; } + + KOKKOS_INLINE_FUNCTION + Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); } + + //---------------------------------------- + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + static + void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ); +#else + KOKKOS_INLINE_FUNCTION static + void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {} +#endif + + KOKKOS_INLINE_FUNCTION + TaskMember * get_dependence( int i ) const + { return ( Kokkos::Experimental::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? 
m_dep[i] : (TaskMember*) 0 ; } + + KOKKOS_INLINE_FUNCTION + int get_dependence() const + { return m_dep_size ; } + + KOKKOS_INLINE_FUNCTION + void clear_dependence() + { + for ( int i = 0 ; i < m_dep_size ; ++i ) assign( m_dep + i , 0 ); + m_dep_size = 0 ; + } + + KOKKOS_INLINE_FUNCTION + void add_dependence( TaskMember * before ) + { + if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state || + Kokkos::Experimental::TASK_STATE_EXECUTING == m_state ) && + m_dep_size < m_dep_capacity ) { + assign( m_dep + m_dep_size , before ); + ++m_dep_size ; + } + else { + throw_error_add_dependence(); + } + } + + //---------------------------------------- + + template< class FunctorType , class ResultType > + KOKKOS_INLINE_FUNCTION static + void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t ) + { + typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ; + + // TaskMember< Kokkos::Serial , ResultType , FunctorType > + // : public TaskMember< Kokkos::Serial , ResultType , void > + // , public FunctorType + // { ... }; + + derived_type & m = * static_cast< derived_type * >( t ); + + Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) m , & m.m_result ); + } + + template< class FunctorType , class ResultType > + KOKKOS_INLINE_FUNCTION static + void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t ) + { + typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ; + + // TaskMember< Kokkos::Serial , ResultType , FunctorType > + // : public TaskMember< Kokkos::Serial , ResultType , void > + // , public FunctorType + // { ... 
}; + + derived_type & m = * static_cast< derived_type * >( t ); + + Kokkos::Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) m ); + } + + //---------------------------------------- + + template< class FunctorType , class ResultType > + static + void apply_team( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t ) + { + typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ; + typedef Kokkos::Impl::SerialTeamMember member_type ; + + // TaskMember< Kokkos::Serial , ResultType , FunctorType > + // : public TaskMember< Kokkos::Serial , ResultType , void > + // , public FunctorType + // { ... }; + + derived_type & m = * static_cast< derived_type * >( t ); + + m.FunctorType::apply( member_type(0,1,0) , m.m_result ); + } + + template< class FunctorType , class ResultType > + static + void apply_team( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t ) + { + typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ; + typedef Kokkos::Impl::SerialTeamMember member_type ; + + // TaskMember< Kokkos::Serial , ResultType , FunctorType > + // : public TaskMember< Kokkos::Serial , ResultType , void > + // , public FunctorType + // { ... }; + + derived_type & m = * static_cast< derived_type * >( t ); + + m.FunctorType::apply( member_type(0,1,0) ); + } +}; + +//---------------------------------------------------------------------------- +/** \brief Base class for tasks with a result value in the Serial execution space. + * + * The FunctorType must be void because this class is accessed by the + * Future class for the task and result value. + * + * Must be derived from TaskMember<S,void,void> 'root class' so the Future class + * can correctly static_cast from the 'root class' to this class. 
+ */ +template < class ResultType > +class TaskMember< Kokkos::Serial , ResultType , void > + : public TaskMember< Kokkos::Serial , void , void > +{ +public: + + ResultType m_result ; + + typedef const ResultType & get_result_type ; + + KOKKOS_INLINE_FUNCTION + get_result_type get() const { return m_result ; } + +protected: + + typedef TaskMember< Kokkos::Serial , void , void > task_root_type ; + typedef task_root_type::function_dealloc_type function_dealloc_type ; + typedef task_root_type::function_apply_type function_apply_type ; + + inline + TaskMember( const function_dealloc_type arg_dealloc + , const function_apply_type arg_apply + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + ) + : task_root_type( & task_root_type::template verify_type< ResultType > + , arg_dealloc + , arg_apply + , arg_sizeof_derived + , arg_dependence_capacity ) + , m_result() + {} +}; + +template< class ResultType , class FunctorType > +class TaskMember< Kokkos::Serial , ResultType , FunctorType > + : public TaskMember< Kokkos::Serial , ResultType , void > + , public FunctorType +{ +public: + + typedef FunctorType functor_type ; + + typedef TaskMember< Kokkos::Serial , void , void > task_root_type ; + typedef TaskMember< Kokkos::Serial , ResultType , void > task_base_type ; + typedef task_root_type::function_dealloc_type function_dealloc_type ; + typedef task_root_type::function_apply_type function_apply_type ; + + inline + TaskMember( const function_dealloc_type arg_dealloc + , const function_apply_type arg_apply + , const unsigned arg_sizeof_derived + , const unsigned arg_dependence_capacity + , const functor_type & arg_functor + ) + : task_base_type( arg_dealloc , arg_apply , arg_sizeof_derived , arg_dependence_capacity ) + , functor_type( arg_functor ) + {} +}; + +//---------------------------------------------------------------------------- +/** \brief ForEach task in the Serial execution space + * + * Derived from TaskMember< Kokkos::Serial , 
ResultType , FunctorType > + * so that Functor can be cast to task root type without knowing policy. + */ +template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType > +class TaskForEach< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > + , ResultType + , FunctorType > + : TaskMember< Kokkos::Serial , ResultType , FunctorType > +{ +public: + + typedef FunctorType functor_type ; + typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > policy_type ; + +private: + + friend class Kokkos::Experimental::TaskPolicy< Kokkos::Serial > ; + friend class Kokkos::Experimental::Impl::TaskMember< Kokkos::Serial , void , void > ; + + typedef TaskMember< Kokkos::Serial , void , void > task_root_type ; + typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > task_base_type ; + typedef task_root_type::function_dealloc_type function_dealloc_type ; + + policy_type m_policy ; + + template< class Tag > + inline + typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same<Tag,void>::value >::type + apply_policy() const + { + const typename policy_type::member_type e = m_policy.end(); + for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) { + functor_type::operator()(i); + } + } + + template< class Tag > + inline + typename Kokkos::Impl::enable_if< ! 
Kokkos::Impl::is_same<Tag,void>::value >::type + apply_policy() const + { + const Tag tag ; + const typename policy_type::member_type e = m_policy.end(); + for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) { + functor_type::operator()(tag,i); + } + } + + static + void apply_parallel( task_root_type * t ) + { + static_cast<TaskForEach*>(t)->template apply_policy< typename policy_type::work_tag >(); + + task_root_type::template apply_single< functor_type , ResultType >( t ); + } + + TaskForEach( const function_dealloc_type arg_dealloc + , const int arg_sizeof_derived + , const int arg_dependence_capacity + , const policy_type & arg_policy + , const functor_type & arg_functor + ) + : task_base_type( arg_dealloc + , & apply_parallel + , arg_sizeof_derived + , arg_dependence_capacity + , arg_functor ) + , m_policy( arg_policy ) + {} + + TaskForEach() /* = delete */ ; + TaskForEach( const TaskForEach & ) /* = delete */ ; + TaskForEach & operator = ( const TaskForEach & ) /* = delete */ ; +}; + +//---------------------------------------------------------------------------- +/** \brief Reduce task in the Serial execution space + * + * Derived from TaskMember< Kokkos::Serial , ResultType , FunctorType > + * so that Functor can be cast to task root type without knowing policy. 
+ */ +template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType > +class TaskReduce< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > + , ResultType + , FunctorType > + : TaskMember< Kokkos::Serial , ResultType , FunctorType > +{ +public: + + typedef FunctorType functor_type ; + typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > policy_type ; + +private: + + friend class Kokkos::Experimental::TaskPolicy< Kokkos::Serial > ; + friend class Kokkos::Experimental::Impl::TaskMember< Kokkos::Serial , void , void > ; + + typedef TaskMember< Kokkos::Serial , void , void > task_root_type ; + typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > task_base_type ; + typedef task_root_type::function_dealloc_type function_dealloc_type ; + + policy_type m_policy ; + + template< class Tag > + inline + void apply_policy( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same<Tag,void>::value , ResultType & >::type result ) const + { + Kokkos::Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result ); + const typename policy_type::member_type e = m_policy.end(); + for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) { + functor_type::operator()( i, result ); + } + } + + template< class Tag > + inline + void apply_policy( typename Kokkos::Impl::enable_if< ! 
Kokkos::Impl::is_same<Tag,void>::value , ResultType & >::type result ) const + { + Kokkos::Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result ); + const Tag tag ; + const typename policy_type::member_type e = m_policy.end(); + for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) { + functor_type::operator()( tag, i, result ); + } + } + + static + void apply_parallel( task_root_type * t ) + { + TaskReduce * const task = static_cast<TaskReduce*>(t); + + task->template apply_policy< typename policy_type::work_tag >( task->task_base_type::m_result ); + + task_root_type::template apply_single< functor_type , ResultType >( t ); + } + + TaskReduce( const function_dealloc_type arg_dealloc + , const int arg_sizeof_derived + , const int arg_dependence_capacity + , const policy_type & arg_policy + , const functor_type & arg_functor + ) + : task_base_type( arg_dealloc + , & apply_parallel + , arg_sizeof_derived + , arg_dependence_capacity + , arg_functor ) + , m_policy( arg_policy ) + {} + + TaskReduce() /* = delete */ ; + TaskReduce( const TaskReduce & ) /* = delete */ ; + TaskReduce & operator = ( const TaskReduce & ) /* = delete */ ; +}; + +} /* namespace Impl */ +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +template<> +class TaskPolicy< Kokkos::Serial > +{ +public: + + typedef Kokkos::Serial execution_space ; + typedef Kokkos::Impl::SerialTeamMember member_type ; + +private: + + typedef Impl::TaskMember< execution_space , void , void > task_root_type ; + + template< class FunctorType > + static inline + const task_root_type * get_task_root( const FunctorType * f ) + { + typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ; + return static_cast< const 
task_root_type * >( static_cast< const task_type * >(f) ); + } + + template< class FunctorType > + static inline + task_root_type * get_task_root( FunctorType * f ) + { + typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ; + return static_cast< task_root_type * >( static_cast< task_type * >(f) ); + } + + unsigned m_default_dependence_capacity ; + +public: + + KOKKOS_INLINE_FUNCTION + TaskPolicy() : m_default_dependence_capacity(4) {} + + KOKKOS_INLINE_FUNCTION + TaskPolicy( const TaskPolicy & rhs ) : m_default_dependence_capacity( rhs.m_default_dependence_capacity ) {} + + KOKKOS_INLINE_FUNCTION + explicit + TaskPolicy( const unsigned arg_default_dependence_capacity ) + : m_default_dependence_capacity( arg_default_dependence_capacity ) {} + + KOKKOS_INLINE_FUNCTION + TaskPolicy( const TaskPolicy & + , const unsigned arg_default_dependence_capacity ) + : m_default_dependence_capacity( arg_default_dependence_capacity ) {} + + TaskPolicy & operator = ( const TaskPolicy &rhs ) + { + m_default_dependence_capacity = rhs.m_default_dependence_capacity; + return *this; + } + + //---------------------------------------- + + template< class ValueType > + KOKKOS_INLINE_FUNCTION + const Future< ValueType , execution_space > & + spawn( const Future< ValueType , execution_space > & f ) const + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + f.m_task->schedule(); +#endif + return f ; + } + + // Create single-thread task + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + Future< typename FunctorType::value_type , execution_space > + create( const FunctorType & functor + , const unsigned dependence_capacity = ~0u ) const + { + typedef typename FunctorType::value_type value_type ; + typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ; + return Future< value_type , execution_space >( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + task_root_type::create< task_type >( 
+ functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) ) +#endif + ); + } + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + Future< typename FunctorType::value_type , execution_space > + create_team( const FunctorType & functor + , const unsigned dependence_capacity = ~0u ) const + { + typedef typename FunctorType::value_type value_type ; + typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ; + return Future< value_type , execution_space >( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + task_root_type::create_team< task_type >( + functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) ) +#endif + ); + } + + // Create parallel foreach task + + template< class PolicyType , class FunctorType > + KOKKOS_INLINE_FUNCTION + Future< typename FunctorType::value_type , execution_space > + create_foreach( const PolicyType & policy + , const FunctorType & functor + , const unsigned dependence_capacity = ~0u ) const + { + typedef typename FunctorType::value_type value_type ; + typedef Impl::TaskForEach< PolicyType , value_type , FunctorType > task_type ; + return Future< value_type , execution_space >( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + task_root_type::create< task_type >( policy , functor , + ( ~0u == dependence_capacity ? 
m_default_dependence_capacity : dependence_capacity ) ) +#endif + ); + } + + // Create parallel reduce task + + template< class PolicyType , class FunctorType > + KOKKOS_INLINE_FUNCTION + Future< typename FunctorType::value_type , execution_space > + create_reduce( const PolicyType & policy + , const FunctorType & functor + , const unsigned dependence_capacity = ~0u ) const + { + typedef typename FunctorType::value_type value_type ; + typedef Impl::TaskReduce< PolicyType , value_type , FunctorType > task_type ; + return Future< value_type , execution_space >( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + task_root_type::create< task_type >( policy , functor , + ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) ) +#endif + ); + } + + // Add dependence + template< class A1 , class A2 , class A3 , class A4 > + KOKKOS_INLINE_FUNCTION + void add_dependence( const Future<A1,A2> & after + , const Future<A3,A4> & before + , typename Kokkos::Impl::enable_if + < Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value + && + Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value + >::type * = 0 + ) const + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + after.m_task->add_dependence( before.m_task ); +#endif + } + + //---------------------------------------- + // Functions for an executing task functor to query dependences, + // set new dependences, and respawn itself. 
+ + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + Future< void , execution_space > + get_dependence( const FunctorType * task_functor , int i ) const + { + return Future<void,execution_space>( +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + get_task_root(task_functor)->get_dependence(i) +#endif + ); + } + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + int get_dependence( const FunctorType * task_functor ) const +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { return get_task_root(task_functor)->get_dependence(); } +#else + { return 0 ; } +#endif + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + void clear_dependence( FunctorType * task_functor ) const +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { get_task_root(task_functor)->clear_dependence(); } +#else + {} +#endif + + template< class FunctorType , class A3 , class A4 > + KOKKOS_INLINE_FUNCTION + void add_dependence( FunctorType * task_functor + , const Future<A3,A4> & before + , typename Kokkos::Impl::enable_if + < Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value + >::type * = 0 + ) const +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { get_task_root(task_functor)->add_dependence( before.m_task ); } +#else + {} +#endif + + template< class FunctorType > + KOKKOS_INLINE_FUNCTION + void respawn( FunctorType * task_functor ) const +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { get_task_root(task_functor)->schedule(); } +#else + {} +#endif + + //---------------------------------------- + + static member_type & member_single(); +}; + +inline +void wait( TaskPolicy< Kokkos::Serial > & ) +{ Impl::TaskMember< Kokkos::Serial , void , void >::execute_ready_tasks(); } + +} /* namespace Experimental */ +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +#endif /* defined( KOKKOS_HAVE_SERIAL ) */ +#endif /* #define 
KOKKOS_SERIAL_TASK_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Shape.cpp b/lib/kokkos/core/src/impl/Kokkos_Shape.cpp new file mode 100755 index 0000000000000000000000000000000000000000..da12db1f381e790e46604f8a15280d2a07f5152a --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Shape.cpp @@ -0,0 +1,178 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#include <sstream> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_Shape.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void assert_counts_are_equal_throw( + const size_t x_count , + const size_t y_count ) +{ + std::ostringstream msg ; + + msg << "Kokkos::Impl::assert_counts_are_equal_throw( " + << x_count << " != " << y_count << " )" ; + + throw_runtime_exception( msg.str() ); +} + +void assert_shapes_are_equal_throw( + const unsigned x_scalar_size , + const unsigned x_rank , + const size_t x_N0 , const unsigned x_N1 , + const unsigned x_N2 , const unsigned x_N3 , + const unsigned x_N4 , const unsigned x_N5 , + const unsigned x_N6 , const unsigned x_N7 , + + const unsigned y_scalar_size , + const unsigned y_rank , + const size_t y_N0 , const unsigned y_N1 , + const unsigned y_N2 , const unsigned y_N3 , + const unsigned y_N4 , const unsigned y_N5 , + const unsigned y_N6 , const unsigned y_N7 ) +{ + std::ostringstream msg ; + + msg << "Kokkos::Impl::assert_shape_are_equal_throw( {" + << " scalar_size(" << x_scalar_size + << ") rank(" << x_rank + << ") dimension(" ; + if ( 0 < x_rank ) { msg << " " 
<< x_N0 ; } + if ( 1 < x_rank ) { msg << " " << x_N1 ; } + if ( 2 < x_rank ) { msg << " " << x_N2 ; } + if ( 3 < x_rank ) { msg << " " << x_N3 ; } + if ( 4 < x_rank ) { msg << " " << x_N4 ; } + if ( 5 < x_rank ) { msg << " " << x_N5 ; } + if ( 6 < x_rank ) { msg << " " << x_N6 ; } + if ( 7 < x_rank ) { msg << " " << x_N7 ; } + msg << " ) } != { " + << " scalar_size(" << y_scalar_size + << ") rank(" << y_rank + << ") dimension(" ; + if ( 0 < y_rank ) { msg << " " << y_N0 ; } + if ( 1 < y_rank ) { msg << " " << y_N1 ; } + if ( 2 < y_rank ) { msg << " " << y_N2 ; } + if ( 3 < y_rank ) { msg << " " << y_N3 ; } + if ( 4 < y_rank ) { msg << " " << y_N4 ; } + if ( 5 < y_rank ) { msg << " " << y_N5 ; } + if ( 6 < y_rank ) { msg << " " << y_N6 ; } + if ( 7 < y_rank ) { msg << " " << y_N7 ; } + msg << " ) } )" ; + + throw_runtime_exception( msg.str() ); +} + +void AssertShapeBoundsAbort< Kokkos::HostSpace >::apply( + const size_t rank , + const size_t n0 , const size_t n1 , + const size_t n2 , const size_t n3 , + const size_t n4 , const size_t n5 , + const size_t n6 , const size_t n7 , + + const size_t arg_rank , + const size_t i0 , const size_t i1 , + const size_t i2 , const size_t i3 , + const size_t i4 , const size_t i5 , + const size_t i6 , const size_t i7 ) +{ + std::ostringstream msg ; + msg << "Kokkos::Impl::AssertShapeBoundsAbort( shape = {" ; + if ( 0 < rank ) { msg << " " << n0 ; } + if ( 1 < rank ) { msg << " " << n1 ; } + if ( 2 < rank ) { msg << " " << n2 ; } + if ( 3 < rank ) { msg << " " << n3 ; } + if ( 4 < rank ) { msg << " " << n4 ; } + if ( 5 < rank ) { msg << " " << n5 ; } + if ( 6 < rank ) { msg << " " << n6 ; } + if ( 7 < rank ) { msg << " " << n7 ; } + msg << " } index = {" ; + if ( 0 < arg_rank ) { msg << " " << i0 ; } + if ( 1 < arg_rank ) { msg << " " << i1 ; } + if ( 2 < arg_rank ) { msg << " " << i2 ; } + if ( 3 < arg_rank ) { msg << " " << i3 ; } + if ( 4 < arg_rank ) { msg << " " << i4 ; } + if ( 5 < arg_rank ) { msg << " " << i5 ; } + if ( 6 < 
arg_rank ) { msg << " " << i6 ; } + if ( 7 < arg_rank ) { msg << " " << i7 ; } + msg << " } )" ; + + throw_runtime_exception( msg.str() ); +} + +void assert_shape_effective_rank1_at_leastN_throw( + const size_t x_rank , const size_t x_N0 , + const size_t x_N1 , const size_t x_N2 , + const size_t x_N3 , const size_t x_N4 , + const size_t x_N5 , const size_t x_N6 , + const size_t x_N7 , + const size_t N0 ) +{ + std::ostringstream msg ; + + msg << "Kokkos::Impl::assert_shape_effective_rank1_at_leastN_throw( shape = {" ; + if ( 0 < x_rank ) { msg << " " << x_N0 ; } + if ( 1 < x_rank ) { msg << " " << x_N1 ; } + if ( 2 < x_rank ) { msg << " " << x_N2 ; } + if ( 3 < x_rank ) { msg << " " << x_N3 ; } + if ( 4 < x_rank ) { msg << " " << x_N4 ; } + if ( 5 < x_rank ) { msg << " " << x_N5 ; } + if ( 6 < x_rank ) { msg << " " << x_N6 ; } + if ( 7 < x_rank ) { msg << " " << x_N7 ; } + msg << " } N = " << N0 << " )" ; + + throw_runtime_exception( msg.str() ); +} + + + +} +} + diff --git a/lib/kokkos/core/src/impl/Kokkos_Shape.hpp b/lib/kokkos/core/src/impl/Kokkos_Shape.hpp new file mode 100755 index 0000000000000000000000000000000000000000..dba73012701776b028b0f3cbc109e2b9c6231644 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Shape.hpp @@ -0,0 +1,917 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SHAPE_HPP +#define KOKKOS_SHAPE_HPP + +#include <typeinfo> +#include <utility> +#include <Kokkos_Core_fwd.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_StaticAssert.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +/** \brief The shape of a Kokkos with dynamic and static dimensions. 
+ * Dynamic dimensions are member values and static dimensions are + * 'static const' values. + * + * The upper bound on the array rank is eight. + */ +template< unsigned ScalarSize , + unsigned Rank , + unsigned s0 = 1 , + unsigned s1 = 1 , + unsigned s2 = 1 , + unsigned s3 = 1 , + unsigned s4 = 1 , + unsigned s5 = 1 , + unsigned s6 = 1 , + unsigned s7 = 1 > +struct Shape ; + +//---------------------------------------------------------------------------- +/** \brief Shape equality if the value type, layout, and dimensions + * are equal. + */ +template< unsigned xSize , unsigned xRank , + unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , + unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , + + unsigned ySize , unsigned yRank , + unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , + unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > +KOKKOS_INLINE_FUNCTION +bool operator == ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x , + const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y ) +{ + enum { same_size = xSize == ySize }; + enum { same_rank = xRank == yRank }; + + return same_size && same_rank && + size_t( x.N0 ) == size_t( y.N0 ) && + unsigned( x.N1 ) == unsigned( y.N1 ) && + unsigned( x.N2 ) == unsigned( y.N2 ) && + unsigned( x.N3 ) == unsigned( y.N3 ) && + unsigned( x.N4 ) == unsigned( y.N4 ) && + unsigned( x.N5 ) == unsigned( y.N5 ) && + unsigned( x.N6 ) == unsigned( y.N6 ) && + unsigned( x.N7 ) == unsigned( y.N7 ) ; +} + +template< unsigned xSize , unsigned xRank , + unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , + unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , + + unsigned ySize ,unsigned yRank , + unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , + unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > +KOKKOS_INLINE_FUNCTION +bool operator != ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x , + const 
Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y ) +{ return ! operator == ( x , y ); } + +//---------------------------------------------------------------------------- + +void assert_counts_are_equal_throw( + const size_t x_count , + const size_t y_count ); + +inline +void assert_counts_are_equal( + const size_t x_count , + const size_t y_count ) +{ + if ( x_count != y_count ) { + assert_counts_are_equal_throw( x_count , y_count ); + } +} + +void assert_shapes_are_equal_throw( + const unsigned x_scalar_size , + const unsigned x_rank , + const size_t x_N0 , const unsigned x_N1 , + const unsigned x_N2 , const unsigned x_N3 , + const unsigned x_N4 , const unsigned x_N5 , + const unsigned x_N6 , const unsigned x_N7 , + + const unsigned y_scalar_size , + const unsigned y_rank , + const size_t y_N0 , const unsigned y_N1 , + const unsigned y_N2 , const unsigned y_N3 , + const unsigned y_N4 , const unsigned y_N5 , + const unsigned y_N6 , const unsigned y_N7 ); + +template< unsigned xSize , unsigned xRank , + unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , + unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , + + unsigned ySize , unsigned yRank , + unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , + unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > +inline +void assert_shapes_are_equal( + const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x , + const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y ) +{ + typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ; + typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ; + + if ( x != y ) { + assert_shapes_are_equal_throw( + x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7, + y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 ); + } +} + +template< unsigned xSize , unsigned xRank , + unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , + unsigned xN4 , unsigned xN5 , 
unsigned xN6 , unsigned xN7 , + + unsigned ySize , unsigned yRank , + unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , + unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > +void assert_shapes_equal_dimension( + const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x , + const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y ) +{ + typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ; + typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ; + + // Omit comparison of scalar_size. + if ( unsigned( x.rank ) != unsigned( y.rank ) || + size_t( x.N0 ) != size_t( y.N0 ) || + unsigned( x.N1 ) != unsigned( y.N1 ) || + unsigned( x.N2 ) != unsigned( y.N2 ) || + unsigned( x.N3 ) != unsigned( y.N3 ) || + unsigned( x.N4 ) != unsigned( y.N4 ) || + unsigned( x.N5 ) != unsigned( y.N5 ) || + unsigned( x.N6 ) != unsigned( y.N6 ) || + unsigned( x.N7 ) != unsigned( y.N7 ) ) { + assert_shapes_are_equal_throw( + x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7, + y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 ); + } +} + +//---------------------------------------------------------------------------- + +template< class ShapeType > struct assert_shape_is_rank_zero ; +template< class ShapeType > struct assert_shape_is_rank_one ; + +template< unsigned Size > +struct assert_shape_is_rank_zero< Shape<Size,0> > + : public true_type {}; + +template< unsigned Size , unsigned s0 > +struct assert_shape_is_rank_one< Shape<Size,1,s0> > + : public true_type {}; + +//---------------------------------------------------------------------------- + +/** \brief Array bounds assertion templated on the execution space + * to allow device-specific abort code. 
+ */ +template< class Space > +struct AssertShapeBoundsAbort ; + +template<> +struct AssertShapeBoundsAbort< Kokkos::HostSpace > +{ + static void apply( const size_t rank , + const size_t n0 , const size_t n1 , + const size_t n2 , const size_t n3 , + const size_t n4 , const size_t n5 , + const size_t n6 , const size_t n7 , + const size_t arg_rank , + const size_t i0 , const size_t i1 , + const size_t i2 , const size_t i3 , + const size_t i4 , const size_t i5 , + const size_t i6 , const size_t i7 ); +}; + +template< class ExecutionSpace > +struct AssertShapeBoundsAbort +{ + KOKKOS_INLINE_FUNCTION + static void apply( const size_t rank , + const size_t n0 , const size_t n1 , + const size_t n2 , const size_t n3 , + const size_t n4 , const size_t n5 , + const size_t n6 , const size_t n7 , + const size_t arg_rank , + const size_t i0 , const size_t i1 , + const size_t i2 , const size_t i3 , + const size_t i4 , const size_t i5 , + const size_t i6 , const size_t i7 ) + { + AssertShapeBoundsAbort< Kokkos::HostSpace > + ::apply( rank , n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 , + arg_rank, i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 ); + } +}; + +template< class ShapeType > +KOKKOS_INLINE_FUNCTION +void assert_shape_bounds( const ShapeType & shape , + const size_t arg_rank , + const size_t i0 , + const size_t i1 = 0 , + const size_t i2 = 0 , + const size_t i3 = 0 , + const size_t i4 = 0 , + const size_t i5 = 0 , + const size_t i6 = 0 , + const size_t i7 = 0 ) +{ + // Must supply at least as many indices as ranks. + // Every index must be within bounds. + const bool ok = ShapeType::rank <= arg_rank && + i0 < shape.N0 && + i1 < shape.N1 && + i2 < shape.N2 && + i3 < shape.N3 && + i4 < shape.N4 && + i5 < shape.N5 && + i6 < shape.N6 && + i7 < shape.N7 ; + + if ( ! 
ok ) { + AssertShapeBoundsAbort< Kokkos::Impl::ActiveExecutionMemorySpace > + ::apply( ShapeType::rank , + shape.N0 , shape.N1 , shape.N2 , shape.N3 , + shape.N4 , shape.N5 , shape.N6 , shape.N7 , + arg_rank , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 ); + } +} + +#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) +#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) assert_shape_bounds(S,1,I0); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) assert_shape_bounds(S,2,I0,I1); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) assert_shape_bounds(S,3,I0,I1,I2); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) assert_shape_bounds(S,4,I0,I1,I2,I3); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) assert_shape_bounds(S,5,I0,I1,I2,I3,I4); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) assert_shape_bounds(S,6,I0,I1,I2,I3,I4,I5); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) assert_shape_bounds(S,7,I0,I1,I2,I3,I4,I5,I6); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) assert_shape_bounds(S,8,I0,I1,I2,I3,I4,I5,I6,I7); +#else +#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) /* */ +#endif + + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Specialization and optimization for the Rank 0 shape. 
+ +template < unsigned ScalarSize > +struct Shape< ScalarSize , 0, 1,1,1,1, 1,1,1,1 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 0 }; + enum { rank = 0 }; + + enum { N0 = 1 }; + enum { N1 = 1 }; + enum { N2 = 1 }; + enum { N3 = 1 }; + enum { N4 = 1 }; + enum { N5 = 1 }; + enum { N6 = 1 }; + enum { N7 = 1 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + {} +}; + +//---------------------------------------------------------------------------- + +template< unsigned R > struct assign_shape_dimension ; + +#define KOKKOS_ASSIGN_SHAPE_DIMENSION( R ) \ +template<> \ +struct assign_shape_dimension< R > \ +{ \ + template< class ShapeType > \ + KOKKOS_INLINE_FUNCTION \ + assign_shape_dimension( ShapeType & shape \ + , typename Impl::enable_if<( R < ShapeType::rank_dynamic ), size_t >::type n \ + ) { shape.N ## R = n ; } \ +}; + +KOKKOS_ASSIGN_SHAPE_DIMENSION(0) +KOKKOS_ASSIGN_SHAPE_DIMENSION(1) +KOKKOS_ASSIGN_SHAPE_DIMENSION(2) +KOKKOS_ASSIGN_SHAPE_DIMENSION(3) +KOKKOS_ASSIGN_SHAPE_DIMENSION(4) +KOKKOS_ASSIGN_SHAPE_DIMENSION(5) +KOKKOS_ASSIGN_SHAPE_DIMENSION(6) +KOKKOS_ASSIGN_SHAPE_DIMENSION(7) + +#undef KOKKOS_ASSIGN_SHAPE_DIMENSION + +//---------------------------------------------------------------------------- +// All-static dimension array + +template < unsigned ScalarSize , + unsigned Rank , + unsigned s0 , + unsigned s1 , + unsigned s2 , + unsigned s3 , + unsigned s4 , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape { + + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 0 }; + enum { rank = Rank }; + + enum { N0 = s0 }; + enum { N1 = s1 }; + enum { N2 = s2 }; + enum { N3 = s3 }; + enum { N4 = s4 }; + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 , + 
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + {} +}; + +// 1 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , + unsigned Rank , + unsigned s1 , + unsigned s2 , + unsigned s3 , + unsigned s4 , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,s1,s2,s3, s4,s5,s6,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 1 }; + enum { rank = Rank }; + + size_t N0 ; // For 1 == dynamic_rank allow N0 > 2^32 + + enum { N1 = s1 }; + enum { N2 = s2 }; + enum { N3 = s3 }; + enum { N4 = s4 }; + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + size_t n0 , unsigned = 0 , unsigned = 0 , unsigned = 0 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + { s.N0 = n0 ; } +}; + +// 2 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , unsigned Rank , + unsigned s2 , + unsigned s3 , + unsigned s4 , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,0,s2,s3, s4,s5,s6,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 2 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + + enum { N2 = s2 }; + enum { N3 = s3 }; + enum { N4 = s4 }; + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned = 0 , unsigned = 0 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + { s.N0 = n0 ; s.N1 = n1 ; } +}; + +// 3 == dynamic_rank <= rank <= 8 +template < unsigned Rank , unsigned ScalarSize , + unsigned s3 , + unsigned s4 , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,0,0,s3, s4,s5,s6,s7> +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 3 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + + enum { N3 = s3 }; + enum { N4 = s4 }; + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { 
N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned = 0 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; } +}; + +// 4 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , unsigned Rank , + unsigned s4 , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank, 0,0,0,0, s4,s5,s6,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 4 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + + enum { N4 = s4 }; + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; } +}; + +// 5 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , unsigned Rank , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,0,0,0, 0,s5,s6,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 5 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , + unsigned n4 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; s.N4 = n4 ; } +}; + +// 6 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , unsigned Rank , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,s6,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 6 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + unsigned N5 ; + + enum { N6 = s6 }; 
+ enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , + unsigned n4 , unsigned n5 = 0 , unsigned = 0 , unsigned = 0 ) + { + s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; + s.N4 = n4 ; s.N5 = n5 ; + } +}; + +// 7 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , unsigned Rank , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,0,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 7 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + unsigned N5 ; + unsigned N6 ; + + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , + unsigned n4 , unsigned n5 , unsigned n6 , unsigned = 0 ) + { + s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; + s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; + } +}; + +// 8 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize > +struct Shape< ScalarSize , 8 , 0,0,0,0, 0,0,0,0 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 8 }; + enum { rank = 8 }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + unsigned N5 ; + unsigned N6 ; + unsigned N7 ; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , + unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 ) + { + s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; + s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; s.N7 = n7 ; + } +}; + +//---------------------------------------------------------------------------- + +template< class ShapeType , unsigned N , + unsigned R = ShapeType::rank_dynamic > +struct ShapeInsert ; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 0 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + N , + ShapeType::N0 , + ShapeType::N1 , + ShapeType::N2 , + 
ShapeType::N3 , + ShapeType::N4 , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 1 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + N , + ShapeType::N1 , + ShapeType::N2 , + ShapeType::N3 , + ShapeType::N4 , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 2 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + N , + ShapeType::N2 , + ShapeType::N3 , + ShapeType::N4 , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 3 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + 0 , + N , + ShapeType::N3 , + ShapeType::N4 , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 4 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + 0 , + 0 , + N , + ShapeType::N4 , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 5 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + 0 , + 0 , + 0 , + N , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 6 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + 0 , + 0 , + 0 , + 0 , + N , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 7 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + 0 , + 0 , + 0 , + 0 , + 0 , + N > type ; +}; + +//---------------------------------------------------------------------------- + +template< class DstShape , class SrcShape , + unsigned DstRankDynamic = 
DstShape::rank_dynamic , + bool DstRankDynamicOK = unsigned(DstShape::rank_dynamic) >= unsigned(SrcShape::rank_dynamic) > +struct ShapeCompatible { enum { value = false }; }; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 8 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 7 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 6 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 5 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 4 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N4) == unsigned(SrcShape::N4) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 3 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N3) == unsigned(SrcShape::N3) && + unsigned(DstShape::N4) == 
unsigned(SrcShape::N4) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 2 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N2) == unsigned(SrcShape::N2) && + unsigned(DstShape::N3) == unsigned(SrcShape::N3) && + unsigned(DstShape::N4) == unsigned(SrcShape::N4) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 1 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N1) == unsigned(SrcShape::N1) && + unsigned(DstShape::N2) == unsigned(SrcShape::N2) && + unsigned(DstShape::N3) == unsigned(SrcShape::N3) && + unsigned(DstShape::N4) == unsigned(SrcShape::N4) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 0 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N0) == unsigned(SrcShape::N0) && + unsigned(DstShape::N1) == unsigned(SrcShape::N1) && + unsigned(DstShape::N2) == unsigned(SrcShape::N2) && + unsigned(DstShape::N3) == unsigned(SrcShape::N3) && + unsigned(DstShape::N4) == unsigned(SrcShape::N4) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + 
+//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< unsigned ScalarSize , unsigned Rank , + unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 , + unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 , + typename iType > +KOKKOS_INLINE_FUNCTION +size_t dimension( + const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape , + const iType & r ) +{ + return 0 == r ? shape.N0 : ( + 1 == r ? shape.N1 : ( + 2 == r ? shape.N2 : ( + 3 == r ? shape.N3 : ( + 4 == r ? shape.N4 : ( + 5 == r ? shape.N5 : ( + 6 == r ? shape.N6 : ( + 7 == r ? shape.N7 : 1 ))))))); +} + +template< unsigned ScalarSize , unsigned Rank , + unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 , + unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 > +KOKKOS_INLINE_FUNCTION +size_t cardinality_count( + const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape ) +{ + return size_t(shape.N0) * shape.N1 * shape.N2 * shape.N3 * + shape.N4 * shape.N5 * shape.N6 * shape.N7 ; +} + +//---------------------------------------------------------------------------- + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#endif /* #ifndef KOKKOS_CORESHAPE_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Singleton.hpp b/lib/kokkos/core/src/impl/Kokkos_Singleton.hpp new file mode 100755 index 0000000000000000000000000000000000000000..86bc94ab0be9e8cfd00ea5a95cebc906bd3aa312 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Singleton.hpp @@ -0,0 +1,55 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SINGLETON_HPP +#define KOKKOS_SINGLETON_HPP + +#include <Kokkos_Macros.hpp> +#include <cstddef> + +namespace Kokkos { namespace Impl { + + +}} // namespace Kokkos::Impl + +#endif // KOKKOS_SINGLETON_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp b/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp new file mode 100755 index 0000000000000000000000000000000000000000..25e2ec9dc1849db862d9cb0d01bfd817c584b3b8 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp @@ -0,0 +1,79 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_STATICASSERT_HPP +#define KOKKOS_STATICASSERT_HPP + +namespace Kokkos { +namespace Impl { + +template < bool , class T = void > +struct StaticAssert ; + +template< class T > +struct StaticAssert< true , T > { + typedef T type ; + static const bool value = true ; +}; + +template < class A , class B > +struct StaticAssertSame ; + +template < class A > +struct StaticAssertSame<A,A> { typedef A type ; }; + +template < class A , class B > +struct StaticAssertAssignable ; + +template < class A > +struct StaticAssertAssignable<A,A> { typedef A type ; }; + +template < class A > +struct StaticAssertAssignable< const A , A > { typedef const A type ; }; + +} // namespace Impl +} // namespace Kokkos + +#endif /* KOKKOS_STATICASSERT_HPP */ + + diff --git a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp new file mode 100755 index 0000000000000000000000000000000000000000..4885d37376e029e11aa8a67dd8ce8ef8f5c2ba7e --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp @@ -0,0 +1,156 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TAGS_HPP +#define KOKKOS_TAGS_HPP + +#include <impl/Kokkos_Traits.hpp> +#include <Kokkos_Core_fwd.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +//---------------------------------------------------------------------------- + +template<class ExecutionSpace, class MemorySpace> +struct Device { + typedef ExecutionSpace execution_space; + typedef MemorySpace memory_space; + typedef Device<execution_space,memory_space> device_type; +}; +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class C , class Enable = void > +struct is_memory_space : public bool_< false > {}; + +template< class C , class Enable = void > +struct is_execution_space : public bool_< false > {}; + +template< class C , class Enable = void > +struct is_execution_policy : public bool_< false > {}; + +template< class C , class Enable = void > +struct is_array_layout : public Impl::false_type {}; + +template< class C , class Enable = void > +struct is_memory_traits : public Impl::false_type {}; + + +template< class C > +struct is_memory_space< C , typename Impl::enable_if_type< typename C::memory_space >::type > + : public bool_< Impl::is_same< C , typename C::memory_space >::value > {}; + +template< class C > +struct is_execution_space< C , typename Impl::enable_if_type< typename C::execution_space >::type > + : public bool_< Impl::is_same< C , typename C::execution_space >::value > {}; + +template< class C > +struct is_execution_policy< C , typename Impl::enable_if_type< typename C::execution_policy >::type > + : public bool_< Impl::is_same< C , 
typename C::execution_policy >::value > {}; + +template< class C > +struct is_array_layout< C , typename Impl::enable_if_type< typename C::array_layout >::type > + : public bool_< Impl::is_same< C , typename C::array_layout >::value > {}; + +template< class C > +struct is_memory_traits< C , typename Impl::enable_if_type< typename C::memory_traits >::type > + : public bool_< Impl::is_same< C , typename C::memory_traits >::value > {}; + + +//---------------------------------------------------------------------------- + +template< class C , class Enable = void > +struct is_space : public Impl::false_type {}; + +template< class C > +struct is_space< C + , typename Impl::enable_if<( + Impl::is_same< C , typename C::execution_space >::value || + Impl::is_same< C , typename C::memory_space >::value || + Impl::is_same< C , Device< + typename C::execution_space, + typename C::memory_space> >::value + )>::type + > + : public Impl::true_type +{ + typedef typename C::execution_space execution_space ; + typedef typename C::memory_space memory_space ; + + // The host_memory_space defines a space with host-resident memory. + // If the execution space's memory space is host accessible then use that execution space. + // else use the HostSpace. + typedef + typename Impl::if_c< Impl::is_same< memory_space , HostSpace >::value +#ifdef KOKKOS_HAVE_CUDA + || Impl::is_same< memory_space , CudaUVMSpace>::value + || Impl::is_same< memory_space , CudaHostPinnedSpace>::value +#endif + , memory_space , HostSpace >::type + host_memory_space ; + + // The host_execution_space defines a space which has access to HostSpace. + // If the execution space can access HostSpace then use that execution space. + // else use the DefaultHostExecutionSpace. 
+#ifdef KOKKOS_HAVE_CUDA + typedef + typename Impl::if_c< Impl::is_same< execution_space , Cuda >::value + , DefaultHostExecutionSpace , execution_space >::type + host_execution_space ; +#else + typedef execution_space host_execution_space; +#endif + + typedef Device<host_execution_space,host_memory_space> host_mirror_space; +}; +} +} + +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp new file mode 100755 index 0000000000000000000000000000000000000000..80a326f0802d36e6092d96d0608c13353cc50ddb --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp @@ -0,0 +1,115 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPLWALLTIME_HPP +#define KOKKOS_IMPLWALLTIME_HPP + +#include <stddef.h> + +#ifdef _MSC_VER +#undef KOKKOS_USE_LIBRT +#include <gettimeofday.c> +#else +#ifdef KOKKOS_USE_LIBRT +#include <ctime> +#else +#include <sys/time.h> +#endif +#endif + +namespace Kokkos { +namespace Impl { + +/** \brief Time since construction */ + +class Timer { +private: + #ifdef KOKKOS_USE_LIBRT + struct timespec m_old; + #else + struct timeval m_old ; + #endif + Timer( const Timer & ); + Timer & operator = ( const Timer & ); +public: + + inline + void reset() { + #ifdef KOKKOS_USE_LIBRT + clock_gettime(CLOCK_REALTIME, &m_old); + #else + gettimeofday( & m_old , ((struct timezone *) NULL ) ); + #endif + } + + inline + ~Timer() {} + + inline + Timer() { reset(); } + + inline + double seconds() const + { + #ifdef KOKKOS_USE_LIBRT + struct timespec m_new; + clock_gettime(CLOCK_REALTIME, &m_new); + + return ( (double) ( m_new.tv_sec - m_old.tv_sec ) ) + + ( (double) ( m_new.tv_nsec - m_old.tv_nsec ) * 1.0e-9 ); + #else + struct timeval m_new ; + + ::gettimeofday( & m_new , ((struct timezone *) NULL ) ); + + return ( (double) ( m_new.tv_sec - m_old.tv_sec ) ) + + ( (double) ( m_new.tv_usec - m_old.tv_usec ) * 1.0e-6 ); + #endif + } +}; + +} // namespace Impl +} // namespace Kokkos + 
+#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp new file mode 100755 index 0000000000000000000000000000000000000000..52358842f54f3dd3ce6f19e971a7c71d02488499 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp @@ -0,0 +1,370 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSTRAITS_HPP +#define KOKKOSTRAITS_HPP + +#include <stddef.h> +#include <stdint.h> +#include <Kokkos_Macros.hpp> + +namespace Kokkos { +namespace Impl { + +/* C++11 conformal compile-time type traits utilities. + * Prefer to use C++11 when portably available. + */ +//---------------------------------------------------------------------------- +// C++11 Helpers: + +template < class T , T v > +struct integral_constant +{ + // Declaration of 'static const' causes an unresolved linker symbol in debug + // static const T value = v ; + enum { value = T(v) }; + typedef T value_type; + typedef integral_constant<T,v> type; + KOKKOS_INLINE_FUNCTION operator T() { return v ; } +}; + +typedef integral_constant<bool,false> false_type ; +typedef integral_constant<bool,true> true_type ; + +//---------------------------------------------------------------------------- +// C++11 Type relationships: + +template< class X , class Y > struct is_same : public false_type {}; +template< class X > struct is_same<X,X> : public true_type {}; + +//---------------------------------------------------------------------------- +// C++11 Type properties: + +template <typename T> struct is_const : public false_type {}; +template <typename T> struct is_const<const T> : public true_type {}; 
+template <typename T> struct is_const<const T & > : public true_type {}; + +template <typename T> struct is_array : public false_type {}; +template <typename T> struct is_array< T[] > : public true_type {}; +template <typename T, unsigned N > struct is_array< T[N] > : public true_type {}; + +//---------------------------------------------------------------------------- +// C++11 Type transformations: + +template <typename T> struct remove_const { typedef T type; }; +template <typename T> struct remove_const<const T> { typedef T type; }; +template <typename T> struct remove_const<const T & > { typedef T & type; }; + +template <typename T> struct add_const { typedef const T type; }; +template <typename T> struct add_const<T & > { typedef const T & type; }; +template <typename T> struct add_const<const T> { typedef const T type; }; +template <typename T> struct add_const<const T & > { typedef const T & type; }; + +template <typename T> struct remove_reference { typedef T type ; }; +template <typename T> struct remove_reference< T & > { typedef T type ; }; +template <typename T> struct remove_reference< const T & > { typedef const T type ; }; + +template <typename T> struct remove_extent { typedef T type ; }; +template <typename T> struct remove_extent<T[]> { typedef T type ; }; +template <typename T, unsigned N > struct remove_extent<T[N]> { typedef T type ; }; + +//---------------------------------------------------------------------------- +// C++11 Other type generators: + +template< bool , class T , class F > +struct condition { typedef F type ; }; + +template< class T , class F > +struct condition<true,T,F> { typedef T type ; }; + +template< bool , class = void > +struct enable_if ; + +template< class T > +struct enable_if< true , T > { typedef T type ; }; + +//---------------------------------------------------------------------------- + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------- +// Other traits + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +template< class , class T = void > +struct enable_if_type { typedef T type ; }; + +//---------------------------------------------------------------------------- + +template< bool B > +struct bool_ : public integral_constant<bool,B> {}; + +template< unsigned I > +struct unsigned_ : public integral_constant<unsigned,I> {}; + +template< int I > +struct int_ : public integral_constant<int,I> {}; + +typedef bool_<true> true_; +typedef bool_<false> false_; +//---------------------------------------------------------------------------- +// if_ + +template < bool Cond , typename TrueType , typename FalseType> +struct if_c +{ + enum { value = Cond }; + + typedef FalseType type; + + + typedef typename remove_const< + typename remove_reference<type>::type >::type value_type ; + + typedef typename add_const<value_type>::type const_value_type ; + + static KOKKOS_INLINE_FUNCTION + const_value_type & select( const_value_type & v ) { return v ; } + + static KOKKOS_INLINE_FUNCTION + value_type & select( value_type & v ) { return v ; } + + template< class T > + static KOKKOS_INLINE_FUNCTION + value_type & select( const T & ) { value_type * ptr(0); return *ptr ; } + + + template< class T > + static KOKKOS_INLINE_FUNCTION + const_value_type & select( const T & , const_value_type & v ) { return v ; } + + template< class T > + static KOKKOS_INLINE_FUNCTION + value_type & select( const T & , value_type & v ) { return v ; } +}; + +template <typename TrueType, typename FalseType> +struct if_c< true , TrueType , FalseType > +{ + enum { value = true }; + + typedef TrueType type; + + + typedef typename remove_const< + typename remove_reference<type>::type >::type value_type ; + + typedef typename add_const<value_type>::type const_value_type ; + + static 
KOKKOS_INLINE_FUNCTION + const_value_type & select( const_value_type & v ) { return v ; } + + static KOKKOS_INLINE_FUNCTION + value_type & select( value_type & v ) { return v ; } + + template< class T > + static KOKKOS_INLINE_FUNCTION + value_type & select( const T & ) { value_type * ptr(0); return *ptr ; } + + + template< class F > + static KOKKOS_INLINE_FUNCTION + const_value_type & select( const_value_type & v , const F & ) { return v ; } + + template< class F > + static KOKKOS_INLINE_FUNCTION + value_type & select( value_type & v , const F & ) { return v ; } +}; + +template< typename TrueType > +struct if_c< false , TrueType , void > +{ + enum { value = false }; + + typedef void type ; + typedef void value_type ; +}; + +template< typename FalseType > +struct if_c< true , void , FalseType > +{ + enum { value = true }; + + typedef void type ; + typedef void value_type ; +}; + +template <typename Cond, typename TrueType, typename FalseType> +struct if_ : public if_c<Cond::value, TrueType, FalseType> {}; + +//---------------------------------------------------------------------------- + +// Allows aliased types: +template< typename T > +struct is_integral : public integral_constant< bool , + ( + Impl::is_same< T , char >::value || + Impl::is_same< T , unsigned char >::value || + Impl::is_same< T , short int >::value || + Impl::is_same< T , unsigned short int >::value || + Impl::is_same< T , int >::value || + Impl::is_same< T , unsigned int >::value || + Impl::is_same< T , long int >::value || + Impl::is_same< T , unsigned long int >::value || + Impl::is_same< T , long long int >::value || + Impl::is_same< T , unsigned long long int >::value || + + Impl::is_same< T , int8_t >::value || + Impl::is_same< T , int16_t >::value || + Impl::is_same< T , int32_t >::value || + Impl::is_same< T , int64_t >::value || + Impl::is_same< T , uint8_t >::value || + Impl::is_same< T , uint16_t >::value || + Impl::is_same< T , uint32_t >::value || + Impl::is_same< T , uint64_t 
>::value + )> +{}; + +//---------------------------------------------------------------------------- + + +template < size_t N > +struct is_power_of_two +{ + enum type { value = (N > 0) && !(N & (N-1)) }; +}; + +template < size_t N , bool OK = is_power_of_two<N>::value > +struct power_of_two ; + +template < size_t N > +struct power_of_two<N,true> +{ + enum type { value = 1+ power_of_two<(N>>1),true>::value }; +}; + +template <> +struct power_of_two<2,true> +{ + enum type { value = 1 }; +}; + +template <> +struct power_of_two<1,true> +{ + enum type { value = 0 }; +}; + +/** \brief If power of two then return power, + * otherwise return ~0u. + */ +static KOKKOS_FORCEINLINE_FUNCTION +unsigned power_of_two_if_valid( const unsigned N ) +{ + unsigned p = ~0u ; + if ( N && ! ( N & ( N - 1 ) ) ) { +#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA ) + p = __ffs(N) - 1 ; +#elif defined( __GNUC__ ) || defined( __GNUG__ ) + p = __builtin_ffs(N) - 1 ; +#elif defined( __INTEL_COMPILER ) + p = _bit_scan_forward(N); +#else + p = 0 ; + for ( unsigned j = 1 ; ! 
( N & j ) ; j <<= 1 ) { ++p ; } +#endif + } + return p ; +} + +//---------------------------------------------------------------------------- + +template< typename T , T v , bool NonZero = ( v != T(0) ) > +struct integral_nonzero_constant +{ + // Declaration of 'static const' causes an unresolved linker symbol in debug + // static const T value = v ; + enum { value = T(v) }; + typedef T value_type ; + typedef integral_nonzero_constant<T,v> type ; + KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & ) {} +}; + +template< typename T , T zero > +struct integral_nonzero_constant<T,zero,false> +{ + const T value ; + typedef T value_type ; + typedef integral_nonzero_constant<T,0> type ; + KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & v ) : value(v) {} +}; + +//---------------------------------------------------------------------------- + +template < class C > struct is_integral_constant : public false_ +{ + typedef void integral_type ; + enum { integral_value = 0 }; +}; + +template < typename T , T v > +struct is_integral_constant< integral_constant<T,v> > : public true_ +{ + typedef T integral_type ; + enum { integral_value = v }; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOSTRAITS_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp new file mode 100755 index 0000000000000000000000000000000000000000..8334af3a3c88285e4121e71d0c8164a8ad277b17 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp @@ -0,0 +1,878 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEWDEFAULT_HPP +#define KOKKOS_VIEWDEFAULT_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template<> +struct ViewAssignment< ViewDefault , ViewDefault , void > +{ + typedef ViewDefault Specialize ; + + //------------------------------------ + /** \brief Compatible value and shape and LayoutLeft/Right to LayoutStride*/ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename enable_if<( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::value + || + ( ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::assignable_value + && + ShapeCompatible< typename ViewTraits<DT,DL,DD,DM>::shape_type , + typename ViewTraits<ST,SL,SD,SM>::shape_type >::value + && + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout,LayoutStride>::value + && (is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout,LayoutLeft>::value || + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout,LayoutRight>::value)) + )>::type * = 0 ) + { + dst.m_offset_map.assign( src.m_offset_map ); + + dst.m_management = src.m_management ; + + dst.m_ptr_on_device = ViewDataManagement< ViewTraits<DT,DL,DD,DM> >::create_handle( src.m_ptr_on_device, src.m_tracker ); + + dst.m_tracker = src.m_tracker ; + + } + + + /** \brief Assign 1D Strided View to LayoutLeft or LayoutRight if stride[0]==1 */ + + template< class DT , class DL , class DD , class DM , + class ST , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const 
View<ST,LayoutStride,SD,SM,Specialize> & src , + const typename enable_if<( + ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,LayoutStride,SD,SM> >::value + || + ( ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,LayoutStride,SD,SM> >::assignable_value + && + ShapeCompatible< typename ViewTraits<DT,DL,DD,DM>::shape_type , + typename ViewTraits<ST,LayoutStride,SD,SM>::shape_type >::value + ) + ) + && + (View<DT,DL,DD,DM,Specialize>::rank==1) + && (is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout,LayoutLeft>::value || + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout,LayoutRight>::value) + )>::type * = 0 ) + { + size_t strides[8]; + src.stride(strides); + if(strides[0]!=1) { + abort("Trying to assign strided 1D View to LayoutRight or LayoutLeft which is not stride-1"); + } + dst.m_offset_map.assign( src.dimension_0(), 0, 0, 0, 0, 0, 0, 0, 0 ); + + dst.m_management = src.m_management ; + + dst.m_ptr_on_device = ViewDataManagement< ViewTraits<DT,DL,DD,DM> >::create_handle( src.m_ptr_on_device, src.m_tracker ); + + dst.m_tracker = src.m_tracker ; + + } + + //------------------------------------ + /** \brief Deep copy data from compatible value type, layout, rank, and specialization. + * Check the dimensions and allocation lengths at runtime. 
+ */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + inline static + void deep_copy( const View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename Impl::enable_if<( + Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::value_type , + typename ViewTraits<ST,SL,SD,SM>::non_const_value_type >::value + && + Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , + typename ViewTraits<ST,SL,SD,SM>::array_layout >::value + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == unsigned(ViewTraits<ST,SL,SD,SM>::rank) ) + )>::type * = 0 ) + { + typedef typename ViewTraits<DT,DL,DD,DM>::memory_space dst_memory_space ; + typedef typename ViewTraits<ST,SL,SD,SM>::memory_space src_memory_space ; + + if ( dst.ptr_on_device() != src.ptr_on_device() ) { + + Impl::assert_shapes_are_equal( dst.m_offset_map , src.m_offset_map ); + + const size_t nbytes = dst.m_offset_map.scalar_size * dst.m_offset_map.capacity(); + + DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes ); + } + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class ExecSpace , class DT , class DL, class DD, class DM, class DS > +struct ViewDefaultConstruct< ExecSpace , Kokkos::View<DT,DL,DD,DM,DS> , true > +{ + Kokkos::View<DT,DL,DD,DM,DS> * const m_ptr ; + + KOKKOS_FORCEINLINE_FUNCTION + void operator()( const typename ExecSpace::size_type& i ) const + { new(m_ptr+i) Kokkos::View<DT,DL,DD,DM,DS>(); } + + ViewDefaultConstruct( Kokkos::View<DT,DL,DD,DM,DS> * pointer , size_t capacity ) + : m_ptr( pointer ) + { + Kokkos::RangePolicy< ExecSpace > range( 0 , capacity ); + parallel_for( range , *this ); + ExecSpace::fence(); + } +}; + +template< class 
SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type + > +struct ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > + , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type + , SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type > +{ +private: + + typedef View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > SrcViewType ; + + enum { V0 = Impl::is_same< SubArg0_type , void >::value ? 1 : 0 }; + enum { V1 = Impl::is_same< SubArg1_type , void >::value ? 1 : 0 }; + enum { V2 = Impl::is_same< SubArg2_type , void >::value ? 1 : 0 }; + enum { V3 = Impl::is_same< SubArg3_type , void >::value ? 1 : 0 }; + enum { V4 = Impl::is_same< SubArg4_type , void >::value ? 1 : 0 }; + enum { V5 = Impl::is_same< SubArg5_type , void >::value ? 1 : 0 }; + enum { V6 = Impl::is_same< SubArg6_type , void >::value ? 1 : 0 }; + enum { V7 = Impl::is_same< SubArg7_type , void >::value ? 1 : 0 }; + + // The source view rank must be equal to the input argument rank + // Once a void argument is encountered all subsequent arguments must be void. + enum { InputRank = + Impl::StaticAssert<( SrcViewType::rank == + ( V0 ? 0 : ( + V1 ? 1 : ( + V2 ? 2 : ( + V3 ? 3 : ( + V4 ? 4 : ( + V5 ? 5 : ( + V6 ? 6 : ( + V7 ? 7 : 8 ))))))) )) + && + ( SrcViewType::rank == + ( 8 - ( V0 + V1 + V2 + V3 + V4 + V5 + V6 + V7 ) ) ) + >::value ? SrcViewType::rank : 0 }; + + enum { R0 = Impl::ViewOffsetRange< SubArg0_type >::is_range ? 1 : 0 }; + enum { R1 = Impl::ViewOffsetRange< SubArg1_type >::is_range ? 1 : 0 }; + enum { R2 = Impl::ViewOffsetRange< SubArg2_type >::is_range ? 1 : 0 }; + enum { R3 = Impl::ViewOffsetRange< SubArg3_type >::is_range ? 1 : 0 }; + enum { R4 = Impl::ViewOffsetRange< SubArg4_type >::is_range ? 
1 : 0 }; + enum { R5 = Impl::ViewOffsetRange< SubArg5_type >::is_range ? 1 : 0 }; + enum { R6 = Impl::ViewOffsetRange< SubArg6_type >::is_range ? 1 : 0 }; + enum { R7 = Impl::ViewOffsetRange< SubArg7_type >::is_range ? 1 : 0 }; + + enum { OutputRank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) }; + + // Reverse + enum { R0_rev = 0 == InputRank ? 0u : ( + 1 == InputRank ? unsigned(R0) : ( + 2 == InputRank ? unsigned(R1) : ( + 3 == InputRank ? unsigned(R2) : ( + 4 == InputRank ? unsigned(R3) : ( + 5 == InputRank ? unsigned(R4) : ( + 6 == InputRank ? unsigned(R5) : ( + 7 == InputRank ? unsigned(R6) : unsigned(R7) ))))))) }; + + typedef typename SrcViewType::array_layout SrcViewLayout ; + + // Choose array layout, attempting to preserve original layout if at all possible. + typedef typename Impl::if_c< + ( // Same Layout IF + // OutputRank 0 + ( OutputRank == 0 ) + || + // OutputRank 1 or 2, InputLayout Left, Interval 0 + // because single stride one or second index has a stride. + ( OutputRank <= 2 && R0 && Impl::is_same<SrcViewLayout,LayoutLeft>::value ) + || + // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1] + // because single stride one or second index has a stride. + ( OutputRank <= 2 && R0_rev && Impl::is_same<SrcViewLayout,LayoutRight>::value ) + ), SrcViewLayout , Kokkos::LayoutStride >::type OutputViewLayout ; + + // Choose data type as a purely dynamic rank array to accomodate a runtime range. 
+ typedef typename Impl::if_c< OutputRank == 0 , typename SrcViewType::value_type , + typename Impl::if_c< OutputRank == 1 , typename SrcViewType::value_type *, + typename Impl::if_c< OutputRank == 2 , typename SrcViewType::value_type **, + typename Impl::if_c< OutputRank == 3 , typename SrcViewType::value_type ***, + typename Impl::if_c< OutputRank == 4 , typename SrcViewType::value_type ****, + typename Impl::if_c< OutputRank == 5 , typename SrcViewType::value_type *****, + typename Impl::if_c< OutputRank == 6 , typename SrcViewType::value_type ******, + typename Impl::if_c< OutputRank == 7 , typename SrcViewType::value_type *******, + typename SrcViewType::value_type ******** + >::type >::type >::type >::type >::type >::type >::type >::type OutputData ; + + // Choose space. + // If the source view's template arg1 or arg2 is a space then use it, + // otherwise use the source view's execution space. + + typedef typename Impl::if_c< Impl::is_space< SrcArg1Type >::value , SrcArg1Type , + typename Impl::if_c< Impl::is_space< SrcArg2Type >::value , SrcArg2Type , typename SrcViewType::device_type + >::type >::type OutputSpace ; + +public: + + // If keeping the layout then match non-data type arguments + // else keep execution space and memory traits. 
+ typedef typename + Impl::if_c< Impl::is_same< SrcViewLayout , OutputViewLayout >::value + , Kokkos::View< OutputData , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > + , Kokkos::View< OutputData , OutputViewLayout , OutputSpace + , typename SrcViewType::memory_traits + , Impl::ViewDefault > + >::type type ; +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +// Construct subview of a Rank 8 view +template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type > +template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type + > +KOKKOS_INLINE_FUNCTION +View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >:: +View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src + , const SubArg0_type & arg0 + , const SubArg1_type & arg1 + , const SubArg2_type & arg2 + , const SubArg3_type & arg3 + , const SubArg4_type & arg4 + , const SubArg5_type & arg5 + , const SubArg6_type & arg6 + , const SubArg7_type & arg7 + ) + : m_ptr_on_device( (typename traits::value_type*) NULL) + , m_offset_map() + , m_management() + , m_tracker() +{ + // This constructor can only be used to construct a subview + // from the source view. This type must match the subview type + // deduced from the source view and subview arguments. 
+ + typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > + , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type + , SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type > + ViewSubviewDeduction ; + + enum { is_a_valid_subview_constructor = + Impl::StaticAssert< + Impl::is_same< View , typename ViewSubviewDeduction::type >::value + >::value + }; + + if ( is_a_valid_subview_constructor ) { + + typedef Impl::ViewOffsetRange< SubArg0_type > R0 ; + typedef Impl::ViewOffsetRange< SubArg1_type > R1 ; + typedef Impl::ViewOffsetRange< SubArg2_type > R2 ; + typedef Impl::ViewOffsetRange< SubArg3_type > R3 ; + typedef Impl::ViewOffsetRange< SubArg4_type > R4 ; + typedef Impl::ViewOffsetRange< SubArg5_type > R5 ; + typedef Impl::ViewOffsetRange< SubArg6_type > R6 ; + typedef Impl::ViewOffsetRange< SubArg7_type > R7 ; + + // 'assign_subview' returns whether the subview offset_map + // introduces noncontiguity in the view. + const bool introduce_noncontiguity = + m_offset_map.assign_subview( src.m_offset_map + , R0::dimension( src.m_offset_map.N0 , arg0 ) + , R1::dimension( src.m_offset_map.N1 , arg1 ) + , R2::dimension( src.m_offset_map.N2 , arg2 ) + , R3::dimension( src.m_offset_map.N3 , arg3 ) + , R4::dimension( src.m_offset_map.N4 , arg4 ) + , R5::dimension( src.m_offset_map.N5 , arg5 ) + , R6::dimension( src.m_offset_map.N6 , arg6 ) + , R7::dimension( src.m_offset_map.N7 , arg7 ) + ); + + if ( m_offset_map.capacity() ) { + + m_management = src.m_management ; + + if ( introduce_noncontiguity ) m_management.set_noncontiguous(); + + m_ptr_on_device = src.m_ptr_on_device + + src.m_offset_map( R0::begin( arg0 ) + , R1::begin( arg1 ) + , R2::begin( arg2 ) + , R3::begin( arg3 ) + , R4::begin( arg4 ) + , R5::begin( arg5 ) + , R6::begin( arg6 ) + , R7::begin( arg7 ) ); + m_tracker = src.m_tracker ; + } + } +} + +// Construct subview of a Rank 7 view +template< class DstDataType , class DstArg1Type , class 
DstArg2Type , class DstArg3Type > +template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + , class SubArg4_type , class SubArg5_type , class SubArg6_type + > +KOKKOS_INLINE_FUNCTION +View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >:: +View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src + , const SubArg0_type & arg0 + , const SubArg1_type & arg1 + , const SubArg2_type & arg2 + , const SubArg3_type & arg3 + , const SubArg4_type & arg4 + , const SubArg5_type & arg5 + , const SubArg6_type & arg6 + ) + : m_ptr_on_device( (typename traits::value_type*) NULL) + , m_offset_map() + , m_management() + , m_tracker() +{ + // This constructor can only be used to construct a subview + // from the source view. This type must match the subview type + // deduced from the source view and subview arguments. + + typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > + , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type + , SubArg4_type , SubArg5_type , SubArg6_type , void > + ViewSubviewDeduction ; + + enum { is_a_valid_subview_constructor = + Impl::StaticAssert< + Impl::is_same< View , typename ViewSubviewDeduction::type >::value + >::value + }; + + if ( is_a_valid_subview_constructor ) { + + typedef Impl::ViewOffsetRange< SubArg0_type > R0 ; + typedef Impl::ViewOffsetRange< SubArg1_type > R1 ; + typedef Impl::ViewOffsetRange< SubArg2_type > R2 ; + typedef Impl::ViewOffsetRange< SubArg3_type > R3 ; + typedef Impl::ViewOffsetRange< SubArg4_type > R4 ; + typedef Impl::ViewOffsetRange< SubArg5_type > R5 ; + typedef Impl::ViewOffsetRange< SubArg6_type > R6 ; + + // 'assign_subview' returns whether the subview offset_map + // introduces noncontiguity in the view. 
+ const bool introduce_noncontiguity = + m_offset_map.assign_subview( src.m_offset_map + , R0::dimension( src.m_offset_map.N0 , arg0 ) + , R1::dimension( src.m_offset_map.N1 , arg1 ) + , R2::dimension( src.m_offset_map.N2 , arg2 ) + , R3::dimension( src.m_offset_map.N3 , arg3 ) + , R4::dimension( src.m_offset_map.N4 , arg4 ) + , R5::dimension( src.m_offset_map.N5 , arg5 ) + , R6::dimension( src.m_offset_map.N6 , arg6 ) + , 0 + ); + + if ( m_offset_map.capacity() ) { + + m_management = src.m_management ; + + if ( introduce_noncontiguity ) m_management.set_noncontiguous(); + + m_ptr_on_device = src.m_ptr_on_device + + src.m_offset_map( R0::begin( arg0 ) + , R1::begin( arg1 ) + , R2::begin( arg2 ) + , R3::begin( arg3 ) + , R4::begin( arg4 ) + , R5::begin( arg5 ) + , R6::begin( arg6 ) + ); + m_tracker = src.m_tracker ; + } + } +} + +// Construct subview of a Rank 6 view +template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type > +template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + , class SubArg4_type , class SubArg5_type + > +KOKKOS_INLINE_FUNCTION +View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >:: +View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src + , const SubArg0_type & arg0 + , const SubArg1_type & arg1 + , const SubArg2_type & arg2 + , const SubArg3_type & arg3 + , const SubArg4_type & arg4 + , const SubArg5_type & arg5 + ) + : m_ptr_on_device( (typename traits::value_type*) NULL) + , m_offset_map() + , m_management() + , m_tracker() +{ + // This constructor can only be used to construct a subview + // from the source view. This type must match the subview type + // deduced from the source view and subview arguments. 
+ + typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > + , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type + , SubArg4_type , SubArg5_type , void , void > + ViewSubviewDeduction ; + + enum { is_a_valid_subview_constructor = + Impl::StaticAssert< + Impl::is_same< View , typename ViewSubviewDeduction::type >::value + >::value + }; + + if ( is_a_valid_subview_constructor ) { + + typedef Impl::ViewOffsetRange< SubArg0_type > R0 ; + typedef Impl::ViewOffsetRange< SubArg1_type > R1 ; + typedef Impl::ViewOffsetRange< SubArg2_type > R2 ; + typedef Impl::ViewOffsetRange< SubArg3_type > R3 ; + typedef Impl::ViewOffsetRange< SubArg4_type > R4 ; + typedef Impl::ViewOffsetRange< SubArg5_type > R5 ; + + // 'assign_subview' returns whether the subview offset_map + // introduces noncontiguity in the view. + const bool introduce_noncontiguity = + m_offset_map.assign_subview( src.m_offset_map + , R0::dimension( src.m_offset_map.N0 , arg0 ) + , R1::dimension( src.m_offset_map.N1 , arg1 ) + , R2::dimension( src.m_offset_map.N2 , arg2 ) + , R3::dimension( src.m_offset_map.N3 , arg3 ) + , R4::dimension( src.m_offset_map.N4 , arg4 ) + , R5::dimension( src.m_offset_map.N5 , arg5 ) + , 0 + , 0 + ); + + if ( m_offset_map.capacity() ) { + + m_management = src.m_management ; + + if ( introduce_noncontiguity ) m_management.set_noncontiguous(); + + m_ptr_on_device = src.m_ptr_on_device + + src.m_offset_map( R0::begin( arg0 ) + , R1::begin( arg1 ) + , R2::begin( arg2 ) + , R3::begin( arg3 ) + , R4::begin( arg4 ) + , R5::begin( arg5 ) + ); + m_tracker = src.m_tracker ; + } + } +} + +// Construct subview of a Rank 5 view +template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type > +template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + , class SubArg4_type + > 
+KOKKOS_INLINE_FUNCTION +View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >:: +View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src + , const SubArg0_type & arg0 + , const SubArg1_type & arg1 + , const SubArg2_type & arg2 + , const SubArg3_type & arg3 + , const SubArg4_type & arg4 + ) + : m_ptr_on_device( (typename traits::value_type*) NULL) + , m_offset_map() + , m_management() + , m_tracker() +{ + // This constructor can only be used to construct a subview + // from the source view. This type must match the subview type + // deduced from the source view and subview arguments. + + typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > + , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type + , SubArg4_type , void , void , void > + ViewSubviewDeduction ; + + enum { is_a_valid_subview_constructor = + Impl::StaticAssert< + Impl::is_same< View , typename ViewSubviewDeduction::type >::value + >::value + }; + + if ( is_a_valid_subview_constructor ) { + + typedef Impl::ViewOffsetRange< SubArg0_type > R0 ; + typedef Impl::ViewOffsetRange< SubArg1_type > R1 ; + typedef Impl::ViewOffsetRange< SubArg2_type > R2 ; + typedef Impl::ViewOffsetRange< SubArg3_type > R3 ; + typedef Impl::ViewOffsetRange< SubArg4_type > R4 ; + + // 'assign_subview' returns whether the subview offset_map + // introduces noncontiguity in the view. 
+ const bool introduce_noncontiguity = + m_offset_map.assign_subview( src.m_offset_map + , R0::dimension( src.m_offset_map.N0 , arg0 ) + , R1::dimension( src.m_offset_map.N1 , arg1 ) + , R2::dimension( src.m_offset_map.N2 , arg2 ) + , R3::dimension( src.m_offset_map.N3 , arg3 ) + , R4::dimension( src.m_offset_map.N4 , arg4 ) + , 0 + , 0 + , 0 + ); + + if ( m_offset_map.capacity() ) { + + m_management = src.m_management ; + + if ( introduce_noncontiguity ) m_management.set_noncontiguous(); + + m_ptr_on_device = src.m_ptr_on_device + + src.m_offset_map( R0::begin( arg0 ) + , R1::begin( arg1 ) + , R2::begin( arg2 ) + , R3::begin( arg3 ) + , R4::begin( arg4 ) + ); + m_tracker = src.m_tracker ; + } + } +} + +// Construct subview of a Rank 4 view +template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type > +template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type + , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type + > +KOKKOS_INLINE_FUNCTION +View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >:: +View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src + , const SubArg0_type & arg0 + , const SubArg1_type & arg1 + , const SubArg2_type & arg2 + , const SubArg3_type & arg3 + ) + : m_ptr_on_device( (typename traits::value_type*) NULL) + , m_offset_map() + , m_management() + , m_tracker() +{ + // This constructor can only be used to construct a subview + // from the source view. This type must match the subview type + // deduced from the source view and subview arguments. 
+ + typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > + , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type + , void , void , void , void > + ViewSubviewDeduction ; + + enum { is_a_valid_subview_constructor = + Impl::StaticAssert< + Impl::is_same< View , typename ViewSubviewDeduction::type >::value + >::value + }; + + if ( is_a_valid_subview_constructor ) { + + typedef Impl::ViewOffsetRange< SubArg0_type > R0 ; + typedef Impl::ViewOffsetRange< SubArg1_type > R1 ; + typedef Impl::ViewOffsetRange< SubArg2_type > R2 ; + typedef Impl::ViewOffsetRange< SubArg3_type > R3 ; + + // 'assign_subview' returns whether the subview offset_map + // introduces noncontiguity in the view. + const bool introduce_noncontiguity = + m_offset_map.assign_subview( src.m_offset_map + , R0::dimension( src.m_offset_map.N0 , arg0 ) + , R1::dimension( src.m_offset_map.N1 , arg1 ) + , R2::dimension( src.m_offset_map.N2 , arg2 ) + , R3::dimension( src.m_offset_map.N3 , arg3 ) + , 0 + , 0 + , 0 + , 0 + ); + + if ( m_offset_map.capacity() ) { + + m_management = src.m_management ; + + if ( introduce_noncontiguity ) m_management.set_noncontiguous(); + + m_ptr_on_device = src.m_ptr_on_device + + src.m_offset_map( R0::begin( arg0 ) + , R1::begin( arg1 ) + , R2::begin( arg2 ) + , R3::begin( arg3 ) + ); + m_tracker = src.m_tracker ; + } + } +} + +// Construct subview of a Rank 3 view +template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type > +template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type + , class SubArg0_type , class SubArg1_type , class SubArg2_type + > +KOKKOS_INLINE_FUNCTION +View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >:: +View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src + , const SubArg0_type & arg0 + , const SubArg1_type & arg1 + , const SubArg2_type & arg2 + ) + : 
m_ptr_on_device( (typename traits::value_type*) NULL) + , m_offset_map() + , m_management() + , m_tracker() +{ + // This constructor can only be used to construct a subview + // from the source view. This type must match the subview type + // deduced from the source view and subview arguments. + + typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > + , SubArg0_type , SubArg1_type , SubArg2_type , void , void , void , void , void > + ViewSubviewDeduction ; + + enum { is_a_valid_subview_constructor = + Impl::StaticAssert< + Impl::is_same< View , typename ViewSubviewDeduction::type >::value + >::value + }; + + if ( is_a_valid_subview_constructor ) { + + typedef Impl::ViewOffsetRange< SubArg0_type > R0 ; + typedef Impl::ViewOffsetRange< SubArg1_type > R1 ; + typedef Impl::ViewOffsetRange< SubArg2_type > R2 ; + + // 'assign_subview' returns whether the subview offset_map + // introduces noncontiguity in the view. + const bool introduce_noncontiguity = + m_offset_map.assign_subview( src.m_offset_map + , R0::dimension( src.m_offset_map.N0 , arg0 ) + , R1::dimension( src.m_offset_map.N1 , arg1 ) + , R2::dimension( src.m_offset_map.N2 , arg2 ) + , 0 , 0 , 0 , 0 , 0); + + if ( m_offset_map.capacity() ) { + + m_management = src.m_management ; + + if ( introduce_noncontiguity ) m_management.set_noncontiguous(); + + m_ptr_on_device = src.m_ptr_on_device + + src.m_offset_map( R0::begin( arg0 ) + , R1::begin( arg1 ) + , R2::begin( arg2 ) + ); + m_tracker = src.m_tracker ; + } + } +} + +// Construct subview of a Rank 2 view +template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type > +template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type + , class SubArg0_type , class SubArg1_type + > +KOKKOS_INLINE_FUNCTION +View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >:: +View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , 
SrcArg3Type , Impl::ViewDefault > & src + , const SubArg0_type & arg0 + , const SubArg1_type & arg1 + ) + : m_ptr_on_device( (typename traits::value_type*) NULL) + , m_offset_map() + , m_management() + , m_tracker() +{ + // This constructor can only be used to construct a subview + // from the source view. This type must match the subview type + // deduced from the source view and subview arguments. + + typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > + , SubArg0_type , SubArg1_type , void , void , void , void , void , void > + ViewSubviewDeduction ; + + enum { is_a_valid_subview_constructor = + Impl::StaticAssert< + Impl::is_same< View , typename ViewSubviewDeduction::type >::value + >::value + }; + + if ( is_a_valid_subview_constructor ) { + + typedef Impl::ViewOffsetRange< SubArg0_type > R0 ; + typedef Impl::ViewOffsetRange< SubArg1_type > R1 ; + + // 'assign_subview' returns whether the subview offset_map + // introduces noncontiguity in the view. 
+ const bool introduce_noncontiguity = + m_offset_map.assign_subview( src.m_offset_map + , R0::dimension( src.m_offset_map.N0 , arg0 ) + , R1::dimension( src.m_offset_map.N1 , arg1 ) + , 0 , 0 , 0 , 0 , 0 , 0 ); + + if ( m_offset_map.capacity() ) { + + m_management = src.m_management ; + + if ( introduce_noncontiguity ) m_management.set_noncontiguous(); + + m_ptr_on_device = src.m_ptr_on_device + + src.m_offset_map( R0::begin( arg0 ) + , R1::begin( arg1 ) + ); + m_tracker = src.m_tracker ; + } + } +} + +// Construct subview of a Rank 1 view +template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type > +template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type + , class SubArg0_type + > +KOKKOS_INLINE_FUNCTION +View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >:: +View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src + , const SubArg0_type & arg0 + ) + : m_ptr_on_device( (typename traits::value_type*) NULL) + , m_offset_map() + , m_management() + , m_tracker() +{ + // This constructor can only be used to construct a subview + // from the source view. This type must match the subview type + // deduced from the source view and subview arguments. + + typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > + , SubArg0_type , void , void , void , void , void , void , void > + ViewSubviewDeduction ; + + enum { is_a_valid_subview_constructor = + Impl::StaticAssert< + Impl::is_same< View , typename ViewSubviewDeduction::type >::value + >::value + }; + + if ( is_a_valid_subview_constructor ) { + + typedef Impl::ViewOffsetRange< SubArg0_type > R0 ; + + // 'assign_subview' returns whether the subview offset_map + // introduces noncontiguity in the view. 
+ const bool introduce_noncontiguity = + m_offset_map.assign_subview( src.m_offset_map + , R0::dimension( src.m_offset_map.N0 , arg0 ) + , 0 , 0 , 0 , 0 , 0 , 0 , 0 ); + + if ( m_offset_map.capacity() ) { + + m_management = src.m_management ; + + if ( introduce_noncontiguity ) m_management.set_noncontiguous(); + + m_ptr_on_device = src.m_ptr_on_device + + src.m_offset_map( R0::begin( arg0 ) + ); + m_tracker = src.m_tracker ; + } + } +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWDEFAULT_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp new file mode 100755 index 0000000000000000000000000000000000000000..61cd75844fff32d6189784af773d008c58d1ce4a --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp @@ -0,0 +1,1348 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEWOFFSET_HPP +#define KOKKOS_VIEWOFFSET_HPP + +#include <Kokkos_Pair.hpp> +#include <Kokkos_Layout.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Shape.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +struct ALL ; +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { namespace Impl { + +template < class ShapeType , class LayoutType , typename Enable = void > +struct ViewOffset ; + +//---------------------------------------------------------------------------- +// LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding +template < class ShapeType > +struct ViewOffset< ShapeType , LayoutLeft + , typename enable_if<( 1 >= ShapeType::rank + || + 0 == ShapeType::rank_dynamic + )>::type > + : public ShapeType +{ + typedef size_t size_type ; + typedef ShapeType shape_type ; + typedef LayoutLeft array_layout ; + + enum { has_padding = false }; + + template< unsigned R > + KOKKOS_INLINE_FUNCTION + void assign( size_t n ) + { assign_shape_dimension<R>( *this , n ); } + + // Return whether the subview introduced noncontiguity + template< class S , class L > + KOKKOS_INLINE_FUNCTION + typename Impl::enable_if<( 0 == shape_type::rank && + Impl::is_same<L,LayoutLeft>::value + ), bool >::type + assign_subview( const ViewOffset<S,L,void> & + , const size_t n0 + , const size_t n1 + , const size_t n2 + , const size_t n3 + , const size_t n4 + , const size_t n5 + , const size_t n6 + , const size_t n7 + ) + { + return false ; // did not introduce noncontiguity + } + + // This subview must be 1 == rank and 1 == 
rank_dynamic. + // The source dimension #0 must be non-zero and all other dimensions are zero. + // Return whether the subview introduced noncontiguity + template< class S , class L > + KOKKOS_INLINE_FUNCTION + typename Impl::enable_if<( 1 == shape_type::rank && + 1 == shape_type::rank_dynamic && + 1 <= S::rank && + Impl::is_same<L,LayoutLeft>::value + ), bool >::type + assign_subview( const ViewOffset<S,L,void> & + , const size_t n0 + , const size_t n1 + , const size_t n2 + , const size_t n3 + , const size_t n4 + , const size_t n5 + , const size_t n6 + , const size_t n7 + ) + { + // n1 .. n7 must be zero + shape_type::N0 = n0 ; + return false ; // did not introduce noncontiguity + } + + + KOKKOS_INLINE_FUNCTION + void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3 + , size_t n4 , size_t n5 , size_t n6 , size_t n7 + , size_t = 0 ) + { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + )>::type * = 0 ) + { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs + , typename enable_if<( 1 == int(ShapeRHS::rank) + && + 1 == int(shape_type::rank) + && + 1 == int(shape_type::rank_dynamic) + )>::type * = 0 ) + { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); } + + KOKKOS_INLINE_FUNCTION + void set_padding() {} + + KOKKOS_INLINE_FUNCTION + size_type cardinality() const + { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type capacity() const + { return 
size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + // Stride with [ rank ] value is the total length + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + s[0] = 1 ; + if ( 0 < shape_type::rank ) { s[1] = shape_type::N0 ; } + if ( 1 < shape_type::rank ) { s[2] = s[1] * shape_type::N1 ; } + if ( 2 < shape_type::rank ) { s[3] = s[2] * shape_type::N2 ; } + if ( 3 < shape_type::rank ) { s[4] = s[3] * shape_type::N3 ; } + if ( 4 < shape_type::rank ) { s[5] = s[4] * shape_type::N4 ; } + if ( 5 < shape_type::rank ) { s[6] = s[5] * shape_type::N5 ; } + if ( 6 < shape_type::rank ) { s[7] = s[6] * shape_type::N6 ; } + if ( 7 < shape_type::rank ) { s[8] = s[7] * shape_type::N7 ; } + } + + KOKKOS_INLINE_FUNCTION size_type stride_0() const { return 1 ; } + KOKKOS_INLINE_FUNCTION size_type stride_1() const { return shape_type::N0 ; } + KOKKOS_INLINE_FUNCTION size_type stride_2() const { return shape_type::N0 * shape_type::N1 ; } + KOKKOS_INLINE_FUNCTION size_type stride_3() const { return shape_type::N0 * shape_type::N1 * shape_type::N2 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_4() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_5() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_6() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_7() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 ; } + + // rank 1 + template< typename I0 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const & i0 ) const { return i0 ; } + + // rank 2 + template < 
typename I0 , typename I1 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const & i0 , I1 const & i1 ) const + { return i0 + shape_type::N0 * i1 ; } + + //rank 3 + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0 + , I1 const& i1 + , I2 const& i2 + ) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * i2 ); + } + + //rank 4 + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3 ) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * i3 )); + } + + //rank 5 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4 ) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * i4 ))); + } + + //rank 6 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5 ) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * i5 )))); + } + + //rank 7 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * ( + i5 + shape_type::N5 * i6 ))))); + } + + //rank 8 + template < typename I0, typename I1, typename I2, typename I3 + ,typename 
I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7) const
+    {
+      return i0 + shape_type::N0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * (
+             i3 + shape_type::N3 * (
+             i4 + shape_type::N4 * (
+             i5 + shape_type::N5 * (
+             i6 + shape_type::N6 * i7 ))))));
+    }
+};
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutLeft
+                 , typename enable_if<( 1 < ShapeType::rank
+                                        &&
+                                        0 < ShapeType::rank_dynamic
+                                      )>::type >
+  : public ShapeType
+{
+  typedef size_t size_type ;
+  typedef ShapeType shape_type ;
+  typedef LayoutLeft array_layout ;
+
+  enum { has_padding = true };
+
+  size_type S0 ;
+
+  // This subview must be 2 == rank and 2 == rank_dynamic
+  // due to only having stride #0.
+  // The source dimension #0 must be non-zero for stride-one leading dimension.
+  // At most one subsequent dimension can be non-zero.
+  // Return whether the subview introduced noncontiguity.
+ template< class S , class L > + KOKKOS_INLINE_FUNCTION + typename Impl::enable_if<( 2 == shape_type::rank && + 2 == shape_type::rank_dynamic && + 2 <= S::rank && + Impl::is_same<L,LayoutLeft>::value + ), bool >::type + assign_subview( const ViewOffset<S,L,void> & rhs + , const size_t n0 + , const size_t n1 + , const size_t n2 + , const size_t n3 + , const size_t n4 + , const size_t n5 + , const size_t n6 + , const size_t n7 + ) + { + // N1 = second non-zero dimension + // S0 = stride for second non-zero dimension + shape_type::N0 = n0 ; + shape_type::N1 = 0 ; + S0 = 0 ; + + if ( n1 ) { shape_type::N1 = n1 ; S0 = rhs.stride_1(); } + else if ( 2 < S::rank && n2 ) { shape_type::N1 = n2 ; S0 = rhs.stride_2(); } + else if ( 3 < S::rank && n3 ) { shape_type::N1 = n3 ; S0 = rhs.stride_3(); } + else if ( 4 < S::rank && n4 ) { shape_type::N1 = n4 ; S0 = rhs.stride_4(); } + else if ( 5 < S::rank && n5 ) { shape_type::N1 = n5 ; S0 = rhs.stride_5(); } + else if ( 6 < S::rank && n6 ) { shape_type::N1 = n6 ; S0 = rhs.stride_6(); } + else if ( 7 < S::rank && n7 ) { shape_type::N1 = n7 ; S0 = rhs.stride_7(); } + + // Introduce noncontiguity if change the first dimension + // or took a range of a dimension after the second. 
+ return ( size_t(shape_type::N0) != size_t(rhs.N0) ) || ( 0 == n1 ); + } + + + template< unsigned R > + KOKKOS_INLINE_FUNCTION + void assign( size_t n ) + { assign_shape_dimension<R>( *this , n ); } + + + KOKKOS_INLINE_FUNCTION + void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3 + , size_t n4 , size_t n5 , size_t n6 , size_t n7 + , size_t = 0 ) + { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); S0 = shape_type::N0 ; } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + && + int(ShapeRHS::rank_dynamic) == 0 + )>::type * = 0 ) + { + shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); + S0 = shape_type::N0 ; // No padding when dynamic_rank == 0 + } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + && + int(ShapeRHS::rank_dynamic) > 0 + )>::type * = 0 ) + { + shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); + S0 = rhs.S0 ; // possibly padding when dynamic rank > 0 + } + + KOKKOS_INLINE_FUNCTION + void set_padding() + { + enum { div = MEMORY_ALIGNMENT / shape_type::scalar_size }; + enum { mod = MEMORY_ALIGNMENT % shape_type::scalar_size }; + enum { align = 0 == mod ? div : 0 }; + + if ( align && MEMORY_ALIGNMENT_THRESHOLD * align < S0 ) { + + const size_type count_mod = S0 % ( div ? 
div : 1 );
+
+      if ( count_mod ) { S0 += align - count_mod ; }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type cardinality() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const
+    { return size_type(S0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  // Stride with [ rank ] as total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < shape_type::rank ) { s[1] = S0 ; }
+      if ( 1 < shape_type::rank ) { s[2] = s[1] * shape_type::N1 ; }
+      if ( 2 < shape_type::rank ) { s[3] = s[2] * shape_type::N2 ; }
+      if ( 3 < shape_type::rank ) { s[4] = s[3] * shape_type::N3 ; }
+      if ( 4 < shape_type::rank ) { s[5] = s[4] * shape_type::N4 ; }
+      if ( 5 < shape_type::rank ) { s[6] = s[5] * shape_type::N5 ; }
+      if ( 6 < shape_type::rank ) { s[7] = s[6] * shape_type::N6 ; }
+      if ( 7 < shape_type::rank ) { s[8] = s[7] * shape_type::N7 ; }
+    }
+
+  KOKKOS_INLINE_FUNCTION size_type stride_0() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_1() const { return S0 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_2() const { return S0 * shape_type::N1 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_3() const { return S0 * shape_type::N1 * shape_type::N2 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_4() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_5() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_6() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_7() const
+    { return S0 * shape_type::N1 * 
shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 ; } + + // rank 2 + template < typename I0 , typename I1 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const & i0 , I1 const & i1) const + { return i0 + S0 * i1 ; } + + //rank 3 + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * i2 ); + } + + //rank 4 + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * i3 )); + } + + //rank 5 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * i4 ))); + } + + //rank 6 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * i5 )))); + } + + //rank 7 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * ( + i5 + shape_type::N5 * i6 ))))); + } + + //rank 8 + template < typename I0, typename I1, 
typename I2, typename I3 + ,typename I4, typename I5, typename I6, typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * ( + i5 + shape_type::N5 * ( + i6 + shape_type::N6 * i7 )))))); + } +}; + +//---------------------------------------------------------------------------- +// LayoutRight AND ( 1 >= rank OR 1 >= rank_dynamic ) : no padding / striding +template < class ShapeType > +struct ViewOffset< ShapeType , LayoutRight + , typename enable_if<( 1 >= ShapeType::rank + || + 1 >= ShapeType::rank_dynamic + )>::type > + : public ShapeType +{ + typedef size_t size_type; + typedef ShapeType shape_type; + typedef LayoutRight array_layout ; + + enum { has_padding = false }; + + // This subview must be 1 == rank and 1 == rank_dynamic + // The source view's last dimension must be non-zero + // Return whether the subview introduced noncontiguity + template< class S , class L > + KOKKOS_INLINE_FUNCTION + typename Impl::enable_if<( 0 == shape_type::rank && + Impl::is_same<L,LayoutRight>::value + ), bool >::type + assign_subview( const ViewOffset<S,L,void> & + , const size_t n0 + , const size_t n1 + , const size_t n2 + , const size_t n3 + , const size_t n4 + , const size_t n5 + , const size_t n6 + , const size_t n7 + ) + { return false ; } + + // This subview must be 1 == rank and 1 == rank_dynamic + // The source view's last dimension must be non-zero + // Return whether the subview introduced noncontiguity + template< class S , class L > + KOKKOS_INLINE_FUNCTION + typename Impl::enable_if<( 1 == shape_type::rank && + 1 == shape_type::rank_dynamic && + 1 <= S::rank && + Impl::is_same<L,LayoutRight>::value + ), bool >::type + assign_subview( const ViewOffset<S,L,void> & + , const size_t n0 + , const size_t n1 + , const 
size_t n2 + , const size_t n3 + , const size_t n4 + , const size_t n5 + , const size_t n6 + , const size_t n7 + ) + { + shape_type::N0 = S::rank == 1 ? n0 : ( + S::rank == 2 ? n1 : ( + S::rank == 3 ? n2 : ( + S::rank == 4 ? n3 : ( + S::rank == 5 ? n4 : ( + S::rank == 6 ? n5 : ( + S::rank == 7 ? n6 : n7 )))))); + // should have n0 .. n_(rank-2) equal zero + return false ; + } + + template< unsigned R > + KOKKOS_INLINE_FUNCTION + void assign( size_t n ) + { assign_shape_dimension<R>( *this , n ); } + + KOKKOS_INLINE_FUNCTION + void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3 + , size_t n4 , size_t n5 , size_t n6 , size_t n7 + , size_t = 0 ) + { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + )>::type * = 0 ) + { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs + , typename enable_if<( 1 == int(ShapeRHS::rank) + && + 1 == int(shape_type::rank) + && + 1 == int(shape_type::rank_dynamic) + )>::type * = 0 ) + { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); } + + KOKKOS_INLINE_FUNCTION + void set_padding() {} + + KOKKOS_INLINE_FUNCTION + size_type cardinality() const + { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type capacity() const + { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + size_type stride_R() const + { + return 
size_type(shape_type::N1) * shape_type::N2 * shape_type::N3 * + shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; + }; + + // Stride with [rank] as total length + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + size_type n = 1 ; + if ( 7 < shape_type::rank ) { s[7] = n ; n *= shape_type::N7 ; } + if ( 6 < shape_type::rank ) { s[6] = n ; n *= shape_type::N6 ; } + if ( 5 < shape_type::rank ) { s[5] = n ; n *= shape_type::N5 ; } + if ( 4 < shape_type::rank ) { s[4] = n ; n *= shape_type::N4 ; } + if ( 3 < shape_type::rank ) { s[3] = n ; n *= shape_type::N3 ; } + if ( 2 < shape_type::rank ) { s[2] = n ; n *= shape_type::N2 ; } + if ( 1 < shape_type::rank ) { s[1] = n ; n *= shape_type::N1 ; } + if ( 0 < shape_type::rank ) { s[0] = n ; } + s[shape_type::rank] = n * shape_type::N0 ; + } + + KOKKOS_INLINE_FUNCTION + size_type stride_7() const { return 1 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_6() const { return shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_5() const { return shape_type::N7 * shape_type::N6 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_4() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_3() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_2() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_1() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_0() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 * shape_type::N1 ; } + + // rank 1 + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0) const + { + 
return i0 ; + } + + // rank 2 + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1 ) const + { + return i1 + shape_type::N1 * i0 ; + } + + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const + { + return i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 )); + } + + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const + { + return i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 ))); + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const + { + return i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 )))); + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const + { + return i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 ))))); + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const + { + return i6 + shape_type::N6 * ( + i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 )))))); + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename 
I5, typename I6, typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const + { + return i7 + shape_type::N7 * ( + i6 + shape_type::N6 * ( + i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 ))))))); + } +}; + +//---------------------------------------------------------------------------- +// LayoutRight AND ( 1 < rank AND 1 < rank_dynamic ) : has padding / striding +template < class ShapeType > +struct ViewOffset< ShapeType , LayoutRight + , typename enable_if<( 1 < ShapeType::rank + && + 1 < ShapeType::rank_dynamic + )>::type > + : public ShapeType +{ + typedef size_t size_type; + typedef ShapeType shape_type; + typedef LayoutRight array_layout ; + + enum { has_padding = true }; + + size_type SR ; + + // This subview must be 2 == rank and 2 == rank_dynamic + // due to only having stride #(rank-1). + // The source dimension #(rank-1) must be non-zero for stride-one leading dimension. + // At most one prior dimension can be non-zero. + // Return whether the subview introduced noncontiguity. + template< class S , class L > + KOKKOS_INLINE_FUNCTION + typename Impl::enable_if<( 2 == shape_type::rank && + 2 == shape_type::rank_dynamic && + 2 <= S::rank && + Impl::is_same<L,LayoutRight>::value + ), bool >::type + assign_subview( const ViewOffset<S,L,void> & rhs + , const size_t n0 + , const size_t n1 + , const size_t n2 + , const size_t n3 + , const size_t n4 + , const size_t n5 + , const size_t n6 + , const size_t n7 + ) + { + const size_type nR = S::rank == 2 ? n1 : ( + S::rank == 3 ? n2 : ( + S::rank == 4 ? n3 : ( + S::rank == 5 ? n4 : ( + S::rank == 6 ? n5 : ( + S::rank == 7 ? 
n6 : n7 ))))); + + // N0 = first non-zero-dimension + // N1 = last non-zero dimension + // SR = stride for second non-zero dimension + shape_type::N0 = 0 ; + shape_type::N1 = nR ; + SR = 0 ; + + if ( n0 ) { shape_type::N0 = n0 ; SR = rhs.stride_0(); } + else if ( 2 < S::rank && n1 ) { shape_type::N0 = n1 ; SR = rhs.stride_1(); } + else if ( 3 < S::rank && n2 ) { shape_type::N0 = n2 ; SR = rhs.stride_2(); } + else if ( 4 < S::rank && n3 ) { shape_type::N0 = n3 ; SR = rhs.stride_3(); } + else if ( 5 < S::rank && n4 ) { shape_type::N0 = n4 ; SR = rhs.stride_4(); } + else if ( 6 < S::rank && n5 ) { shape_type::N0 = n5 ; SR = rhs.stride_5(); } + else if ( 7 < S::rank && n6 ) { shape_type::N0 = n6 ; SR = rhs.stride_6(); } + + // Introduce noncontiguous if change the last dimension + // or take a range of a dimension other than the second-to-last dimension. + + return 2 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N1) || 0 == n0 ) : ( + 3 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N2) || 0 == n1 ) : ( + 4 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N3) || 0 == n2 ) : ( + 5 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N4) || 0 == n3 ) : ( + 6 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N5) || 0 == n4 ) : ( + 7 == S::rank ? 
( size_t(shape_type::N1) != size_t(rhs.N6) || 0 == n5 ) : ( + ( size_t(shape_type::N1) != size_t(rhs.N7) || 0 == n6 ) )))))); + } + + template< unsigned R > + KOKKOS_INLINE_FUNCTION + void assign( size_t n ) + { assign_shape_dimension<R>( *this , n ); } + + KOKKOS_INLINE_FUNCTION + void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3 + , size_t n4 , size_t n5 , size_t n6 , size_t n7 + , size_t = 0 ) + { + shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); + SR = size_type(shape_type::N1) * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; + } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + && + int(ShapeRHS::rank_dynamic) <= 1 + )>::type * = 0 ) + { + shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); + SR = shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; + } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + && + int(ShapeRHS::rank_dynamic) > 1 + )>::type * = 0 ) + { + shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); + SR = rhs.SR ; + } + + KOKKOS_INLINE_FUNCTION + void set_padding() + { + enum { div = MEMORY_ALIGNMENT / shape_type::scalar_size }; + enum { mod = MEMORY_ALIGNMENT % shape_type::scalar_size }; + enum { align = 0 == mod ? div : 0 }; + + if ( align && MEMORY_ALIGNMENT_THRESHOLD * align < SR ) { + + const size_type count_mod = SR % ( div ? 
div : 1 ); + + if ( count_mod ) { SR += align - count_mod ; } + } + } + + KOKKOS_INLINE_FUNCTION + size_type cardinality() const + { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type capacity() const { return shape_type::N0 * SR ; } + + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + size_type n = 1 ; + if ( 7 < shape_type::rank ) { s[7] = n ; n *= shape_type::N7 ; } + if ( 6 < shape_type::rank ) { s[6] = n ; n *= shape_type::N6 ; } + if ( 5 < shape_type::rank ) { s[5] = n ; n *= shape_type::N5 ; } + if ( 4 < shape_type::rank ) { s[4] = n ; n *= shape_type::N4 ; } + if ( 3 < shape_type::rank ) { s[3] = n ; n *= shape_type::N3 ; } + if ( 2 < shape_type::rank ) { s[2] = n ; n *= shape_type::N2 ; } + if ( 1 < shape_type::rank ) { s[1] = n ; n *= shape_type::N1 ; } + if ( 0 < shape_type::rank ) { s[0] = SR ; } + s[shape_type::rank] = SR * shape_type::N0 ; + } + + KOKKOS_INLINE_FUNCTION + size_type stride_7() const { return 1 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_6() const { return shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_5() const { return shape_type::N7 * shape_type::N6 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_4() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_3() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_2() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_1() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_0() const { return SR ; } + + // rank 2 + template <typename 
I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1 ) const + { + return i1 + i0 * SR ; + } + + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const + { + return i2 + shape_type::N2 * ( i1 ) + + i0 * SR ; + } + + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const + { + return i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( i1 )) + + i0 * SR ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const + { + return i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( i1 ))) + + i0 * SR ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const + { + return i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( i1 )))) + + i0 * SR ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const + { + return i6 + shape_type::N6 * ( + i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( i1 ))))) + + i0 * SR ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6, typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 
const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const + { + return i7 + shape_type::N7 * ( + i6 + shape_type::N6 * ( + i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( i1 )))))) + + i0 * SR ; + } +}; + +//---------------------------------------------------------------------------- +// LayoutStride : +template < class ShapeType > +struct ViewOffset< ShapeType , LayoutStride + , typename enable_if<( 0 < ShapeType::rank )>::type > + : public ShapeType +{ + typedef size_t size_type; + typedef ShapeType shape_type; + typedef LayoutStride array_layout ; + + size_type S[ shape_type::rank + 1 ]; + + template< class SType , class L > + KOKKOS_INLINE_FUNCTION + bool assign_subview( const ViewOffset<SType,L,void> & rhs + , const size_type n0 + , const size_type n1 + , const size_type n2 + , const size_type n3 + , const size_type n4 + , const size_type n5 + , const size_type n6 + , const size_type n7 + ) + { + shape_type::assign( *this, 0,0,0,0, 0,0,0,0 ); + + for ( int i = 0 ; i < int(shape_type::rank+1) ; ++i ) { S[i] = 0 ; } + + // preconditions: + // shape_type::rank <= rhs.rank + // shape_type::rank == count of nonzero( rhs_dim[i] ) + size_type dim[8] = { n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 }; + size_type str[ SType::rank + 1 ]; + + rhs.stride( str ); + + // contract the zero-dimensions + int r = 0 ; + for ( int i = 0 ; i < int(SType::rank) ; ++i ) { + if ( 0 != dim[i] ) { + dim[r] = dim[i] ; + str[r] = str[i] ; + ++r ; + } + } + + if ( int(shape_type::rank) == r ) { + // The shape is non-zero + for ( int i = 0 ; i < int(shape_type::rank) ; ++i ) { + const size_type cap = dim[i] * ( S[i] = str[i] ); + if ( S[ shape_type::rank ] < cap ) S[ shape_type::rank ] = cap ; + } + // set the contracted nonzero dimensions + shape_type::assign( *this, dim[0], dim[1], dim[2], dim[3], dim[4], dim[5], dim[6], dim[7] ); + } + + return true ; // definitely noncontiguous + } + + template< unsigned R > + 
KOKKOS_INLINE_FUNCTION + void assign( size_t n ) + { assign_shape_dimension<R>( *this , n ); } + + template< class ShapeRHS , class Layout > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset<ShapeRHS,Layout> & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) )>::type * = 0 ) + { + rhs.stride(S); + shape_type::assign( *this, rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); + } + + KOKKOS_INLINE_FUNCTION + void assign( const LayoutStride & layout ) + { + size_type max = 0 ; + for ( int i = 0 ; i < shape_type::rank ; ++i ) { + S[i] = layout.stride[i] ; + const size_type m = layout.dimension[i] * S[i] ; + if ( max < m ) { max = m ; } + } + S[ shape_type::rank ] = max ; + shape_type::assign( *this, layout.dimension[0], layout.dimension[1], + layout.dimension[2], layout.dimension[3], + layout.dimension[4], layout.dimension[5], + layout.dimension[6], layout.dimension[7] ); + } + + KOKKOS_INLINE_FUNCTION + void assign( size_t s0 , size_t s1 , size_t s2 , size_t s3 + , size_t s4 , size_t s5 , size_t s6 , size_t s7 + , size_t s8 ) + { + const size_t str[9] = { s0, s1, s2, s3, s4, s5, s6, s7, s8 }; + + // Last argument is the total length. + // Total length must be non-zero. + // All strides must be non-zero and less than total length. + bool ok = 0 < str[ shape_type::rank ] ; + + for ( int i = 0 ; ( i < shape_type::rank ) && + ( ok = 0 < str[i] && str[i] < str[ shape_type::rank ] ); ++i ); + + if ( ok ) { + size_t dim[8] = { 1,1,1,1,1,1,1,1 }; + int iorder[9] = { 0,0,0,0,0,0,0,0,0 }; + + // Ordering of strides smallest to largest. + for ( int i = 1 ; i < shape_type::rank ; ++i ) { + int j = i ; + for ( ; 0 < j && str[i] < str[ iorder[j-1] ] ; --j ) { + iorder[j] = iorder[j-1] ; + } + iorder[j] = i ; + } + + // Last argument is the total length. + iorder[ shape_type::rank ] = shape_type::rank ; + + // Determine dimension associated with each stride. 
+ // Guarantees non-overlap by truncating dimension + // if ( 0 != str[ iorder[i+1] ] % str[ iorder[i] ] ) + for ( int i = 0 ; i < shape_type::rank ; ++i ) { + dim[ iorder[i] ] = str[ iorder[i+1] ] / str[ iorder[i] ] ; + } + + // Assign dimensions and strides: + shape_type::assign( *this, dim[0], dim[1], dim[2], dim[3], dim[4], dim[5], dim[6], dim[7] ); + for ( int i = 0 ; i <= shape_type::rank ; ++i ) { S[i] = str[i] ; } + } + else { + shape_type::assign(*this,0,0,0,0,0,0,0,0); + for ( int i = 0 ; i <= shape_type::rank ; ++i ) { S[i] = 0 ; } + } + } + + KOKKOS_INLINE_FUNCTION + void set_padding() {} + + KOKKOS_INLINE_FUNCTION + size_type cardinality() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type capacity() const { return S[ shape_type::rank ]; } + + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { for ( int i = 0 ; i <= shape_type::rank ; ++i ) { s[i] = S[i] ; } } + + KOKKOS_INLINE_FUNCTION + size_type stride_0() const { return S[0] ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_1() const { return S[1] ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_2() const { return S[2] ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_3() const { return S[3] ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_4() const { return S[4] ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_5() const { return S[5] ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_6() const { return S[6] ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_7() const { return S[7] ; } + + // rank 1 + template <typename I0 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==1),size_type>::type + operator()( I0 const& i0) const + { + return i0 * S[0] ; + } + + // rank 2 + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename 
std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==2),size_type>::type + operator()( I0 const& i0, I1 const& i1 ) const + { + return i0 * S[0] + i1 * S[1] ; + } + + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==3),size_type>::type + operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] ; + } + + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==4),size_type>::type + operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==5),size_type>::type + operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==6),size_type>::type + operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==7),size_type>::type + operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const + { + return i0 * S[0] + 
i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] + i6 * S[6] ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6, typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==8),size_type>::type + operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] + i6 * S[6] + i7 * S[7] ; + } +}; + +//---------------------------------------------------------------------------- + +template< class T > +struct ViewOffsetRange { + + enum { OK_integral_type = Impl::StaticAssert< Impl::is_integral<T>::value >::value }; + + enum { is_range = false }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const , T const & ) { return 0 ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( T const & i ) { return size_t(i) ; } +}; + +template<> +struct ViewOffsetRange<void> { + enum { is_range = false }; +}; + +template<> +struct ViewOffsetRange< Kokkos::ALL > { + enum { is_range = true }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const n , ALL const & ) { return n ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( ALL const & ) { return 0 ; } +}; + +template< typename iType > +struct ViewOffsetRange< std::pair<iType,iType> > { + + enum { OK_integral_type = Impl::StaticAssert< Impl::is_integral<iType>::value >::value }; + + enum { is_range = true }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const n , std::pair<iType,iType> const & r ) + { return ( size_t(r.first) < size_t(r.second) && size_t(r.second) <= n ) ? 
size_t(r.second) - size_t(r.first) : 0 ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( std::pair<iType,iType> const & r ) { return size_t(r.first) ; } +}; + +template< typename iType > +struct ViewOffsetRange< Kokkos::pair<iType,iType> > { + + enum { OK_integral_type = Impl::StaticAssert< Impl::is_integral<iType>::value >::value }; + + enum { is_range = true }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const n , Kokkos::pair<iType,iType> const & r ) + { return ( size_t(r.first) < size_t(r.second) && size_t(r.second) <= n ) ? size_t(r.second) - size_t(r.first) : 0 ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( Kokkos::pair<iType,iType> const & r ) { return size_t(r.first) ; } +}; + +}} // namespace Kokkos::Impl + +#endif //KOKKOS_VIEWOFFSET_HPP + diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp new file mode 100755 index 0000000000000000000000000000000000000000..006b35923d0adb9103979ee2873ea53f2254bdc3 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp @@ -0,0 +1,518 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEWSUPPORT_HPP +#define KOKKOS_VIEWSUPPORT_HPP + +#include <Kokkos_ExecPolicy.hpp> +#include <impl/Kokkos_Shape.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Evaluate if LHS = RHS view assignment is allowed. */ +template< class ViewLHS , class ViewRHS > +struct ViewAssignable +{ + // Same memory space. + // Same value type. 
+ // Compatible 'const' qualifier + // Cannot assign managed = unmannaged + enum { assignable_value = + ( is_same< typename ViewLHS::value_type , + typename ViewRHS::value_type >::value + || + is_same< typename ViewLHS::value_type , + typename ViewRHS::const_value_type >::value ) + && + is_same< typename ViewLHS::memory_space , + typename ViewRHS::memory_space >::value + && + ( ! ( ViewLHS::is_managed && ! ViewRHS::is_managed ) ) + }; + + enum { assignable_shape = + // Compatible shape and matching layout: + ( ShapeCompatible< typename ViewLHS::shape_type , + typename ViewRHS::shape_type >::value + && + is_same< typename ViewLHS::array_layout , + typename ViewRHS::array_layout >::value ) + || + // Matching layout, same rank, and LHS dynamic rank + ( is_same< typename ViewLHS::array_layout , + typename ViewRHS::array_layout >::value + && + int(ViewLHS::rank) == int(ViewRHS::rank) + && + int(ViewLHS::rank) == int(ViewLHS::rank_dynamic) ) + || + // Both rank-0, any shape and layout + ( int(ViewLHS::rank) == 0 && int(ViewRHS::rank) == 0 ) + || + // Both rank-1 and LHS is dynamic rank-1, any shape and layout + ( int(ViewLHS::rank) == 1 && int(ViewRHS::rank) == 1 && + int(ViewLHS::rank_dynamic) == 1 ) + }; + + enum { value = assignable_value && assignable_shape }; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class ExecSpace , class Type , bool Initialize > +struct ViewDefaultConstruct +{ ViewDefaultConstruct( Type * , size_t ) {} }; + + +/** \brief ViewDataHandle provides the type of the 'data handle' which the view + * uses to access data with the [] operator. It also provides + * an allocate function and a function to extract a raw ptr from the + * data handle. 
ViewDataHandle also defines an enum ReferenceAble which + * specifies whether references/pointers to elements can be taken and a + * 'return_type' which is what the view operators will give back. + * Specialisation of this object allows three things depending + * on ViewTraits and compiler options: + * (i) Use special allocator (e.g. huge pages/small pages and pinned memory) + * (ii) Use special data handle type (e.g. add Cuda Texture Object) + * (iii) Use special access intrinsics (e.g. texture fetch and non-caching loads) + */ +template< class StaticViewTraits , class Enable = void > +struct ViewDataHandle { + + enum { ReturnTypeIsReference = true }; + + typedef typename StaticViewTraits::value_type * handle_type; + typedef typename StaticViewTraits::value_type & return_type; + + KOKKOS_INLINE_FUNCTION + static handle_type create_handle( typename StaticViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ ) + { + return handle_type(arg_data_ptr); + } +}; + +template< class StaticViewTraits , class Enable = void > +class ViewDataManagement : public ViewDataHandle< StaticViewTraits > { +private: + + template< class , class > friend class ViewDataManagement ; + + struct PotentiallyManaged {}; + struct StaticallyUnmanaged {}; + + /* Statically unmanaged if traits or not executing in host-accessible memory space */ + typedef typename + Impl::if_c< StaticViewTraits::is_managed && + Impl::is_same< Kokkos::HostSpace + , Kokkos::Impl::ActiveExecutionMemorySpace >::value + , PotentiallyManaged + , StaticallyUnmanaged + >::type StaticManagementTag ; + + enum { Unmanaged = 0x01 + , Noncontiguous = 0x02 + }; + + enum { DefaultTraits = Impl::is_same< StaticManagementTag , StaticallyUnmanaged >::value ? 
Unmanaged : 0 }; + + unsigned m_traits ; ///< Runtime traits + + + template< class T > + inline static + unsigned assign( const ViewDataManagement<T> & rhs , const PotentiallyManaged & ) + { return rhs.m_traits | ( rhs.is_managed() && Kokkos::HostSpace::in_parallel() ? unsigned(Unmanaged) : 0u ); } + + template< class T > + KOKKOS_INLINE_FUNCTION static + unsigned assign( const ViewDataManagement<T> & rhs , const StaticallyUnmanaged & ) + { return rhs.m_traits | Unmanaged ; } + +public: + + typedef typename ViewDataHandle< StaticViewTraits >::handle_type handle_type; + + KOKKOS_INLINE_FUNCTION + ViewDataManagement() : m_traits( DefaultTraits ) {} + + KOKKOS_INLINE_FUNCTION + ViewDataManagement( const ViewDataManagement & rhs ) + : m_traits( assign( rhs , StaticManagementTag() ) ) {} + + KOKKOS_INLINE_FUNCTION + ViewDataManagement & operator = ( const ViewDataManagement & rhs ) + { m_traits = assign( rhs , StaticManagementTag() ); return *this ; } + + template< class SVT > + KOKKOS_INLINE_FUNCTION + ViewDataManagement( const ViewDataManagement<SVT> & rhs ) + : m_traits( assign( rhs , StaticManagementTag() ) ) {} + + template< class SVT > + KOKKOS_INLINE_FUNCTION + ViewDataManagement & operator = ( const ViewDataManagement<SVT> & rhs ) + { m_traits = assign( rhs , StaticManagementTag() ); return *this ; } + + KOKKOS_INLINE_FUNCTION + bool is_managed() const { return ! ( m_traits & Unmanaged ); } + + KOKKOS_INLINE_FUNCTION + bool is_contiguous() const { return ! 
( m_traits & Noncontiguous ); } + + KOKKOS_INLINE_FUNCTION + void set_unmanaged() { m_traits |= Unmanaged ; } + + KOKKOS_INLINE_FUNCTION + void set_noncontiguous() { m_traits |= Noncontiguous ; } + + template< bool Initialize > + static + handle_type allocate( const std::string & label + , const Impl::ViewOffset< typename StaticViewTraits::shape_type, typename StaticViewTraits::array_layout > & offset_map + , AllocationTracker & tracker + ) + { + typedef typename StaticViewTraits::execution_space execution_space ; + typedef typename StaticViewTraits::memory_space memory_space ; + typedef typename StaticViewTraits::value_type value_type ; + + const size_t count = offset_map.capacity(); + + tracker = memory_space::allocate_and_track( label, sizeof(value_type) * count ); + + value_type * ptr = reinterpret_cast<value_type *>(tracker.alloc_ptr()); + + // Default construct within the view's execution space. + (void) ViewDefaultConstruct< execution_space , value_type , Initialize >( ptr , count ); + + return ViewDataHandle< StaticViewTraits >::create_handle(ptr, tracker); + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class OutputView , class InputView , unsigned Rank = OutputView::Rank > +struct ViewRemap +{ + typedef typename OutputView::size_type size_type ; + + const OutputView output ; + const InputView input ; + const size_type n0 ; + const size_type n1 ; + const size_type n2 ; + const size_type n3 ; + const size_type n4 ; + const size_type n5 ; + const size_type n6 ; + const size_type n7 ; + + ViewRemap( const OutputView & arg_out , const InputView & arg_in ) + : output( arg_out ), input( arg_in ) + , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) ) + , n1( std::min( (size_t)arg_out.dimension_1() , 
(size_t)arg_in.dimension_1() ) ) + , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) ) + , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) ) + , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) ) + , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) ) + , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) ) + , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) ) + { + typedef typename OutputView::execution_space execution_space ; + Kokkos::RangePolicy< execution_space > range( 0 , n0 ); + parallel_for( range , *this ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type i0 ) const + { + for ( size_type i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_type i2 = 0 ; i2 < n2 ; ++i2 ) { + for ( size_type i3 = 0 ; i3 < n3 ; ++i3 ) { + for ( size_type i4 = 0 ; i4 < n4 ; ++i4 ) { + for ( size_type i5 = 0 ; i5 < n5 ; ++i5 ) { + for ( size_type i6 = 0 ; i6 < n6 ; ++i6 ) { + for ( size_type i7 = 0 ; i7 < n7 ; ++i7 ) { + output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input.at(i0,i1,i2,i3,i4,i5,i6,i7); + }}}}}}} + } +}; + +template< class OutputView , class InputView > +struct ViewRemap< OutputView , InputView , 0 > +{ + typedef typename OutputView::value_type value_type ; + typedef typename OutputView::memory_space dst_space ; + typedef typename InputView ::memory_space src_space ; + + ViewRemap( const OutputView & arg_out , const InputView & arg_in ) + { + DeepCopy< dst_space , src_space >( arg_out.ptr_on_device() , + arg_in.ptr_on_device() , + sizeof(value_type) ); + } +}; + +//---------------------------------------------------------------------------- + +template< class ExecSpace , class Type > +struct ViewDefaultConstruct< ExecSpace , Type , true > +{ + Type * const m_ptr ; + + KOKKOS_FORCEINLINE_FUNCTION + void operator()( const typename ExecSpace::size_type& i ) const + { m_ptr[i] = Type(); } + + 
ViewDefaultConstruct( Type * pointer , size_t capacity ) + : m_ptr( pointer ) + { + Kokkos::RangePolicy< ExecSpace > range( 0 , capacity ); + parallel_for( range , *this ); + ExecSpace::fence(); + } +}; + +template< class OutputView , unsigned Rank = OutputView::Rank , + class Enabled = void > +struct ViewFill +{ + typedef typename OutputView::const_value_type const_value_type ; + typedef typename OutputView::size_type size_type ; + + const OutputView output ; + const_value_type input ; + + ViewFill( const OutputView & arg_out , const_value_type & arg_in ) + : output( arg_out ), input( arg_in ) + { + typedef typename OutputView::execution_space execution_space ; + Kokkos::RangePolicy< execution_space > range( 0 , output.dimension_0() ); + parallel_for( range , *this ); + execution_space::fence(); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type i0 ) const + { + for ( size_type i1 = 0 ; i1 < output.dimension_1() ; ++i1 ) { + for ( size_type i2 = 0 ; i2 < output.dimension_2() ; ++i2 ) { + for ( size_type i3 = 0 ; i3 < output.dimension_3() ; ++i3 ) { + for ( size_type i4 = 0 ; i4 < output.dimension_4() ; ++i4 ) { + for ( size_type i5 = 0 ; i5 < output.dimension_5() ; ++i5 ) { + for ( size_type i6 = 0 ; i6 < output.dimension_6() ; ++i6 ) { + for ( size_type i7 = 0 ; i7 < output.dimension_7() ; ++i7 ) { + output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input ; + }}}}}}} + } +}; + +template< class OutputView > +struct ViewFill< OutputView , 0 > +{ + typedef typename OutputView::const_value_type const_value_type ; + typedef typename OutputView::memory_space dst_space ; + + ViewFill( const OutputView & arg_out , const_value_type & arg_in ) + { + DeepCopy< dst_space , dst_space >( arg_out.ptr_on_device() , & arg_in , + sizeof(const_value_type) ); + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + 
+namespace Kokkos { + +struct ViewAllocateWithoutInitializing { + + const std::string label ; + + ViewAllocateWithoutInitializing() : label() {} + ViewAllocateWithoutInitializing( const std::string & arg_label ) : label( arg_label ) {} + ViewAllocateWithoutInitializing( const char * const arg_label ) : label( arg_label ) {} +}; + +struct ViewAllocate { + + const std::string label ; + + ViewAllocate() : label() {} + ViewAllocate( const std::string & arg_label ) : label( arg_label ) {} + ViewAllocate( const char * const arg_label ) : label( arg_label ) {} +}; + +} + +namespace Kokkos { +namespace Impl { + +template< class Traits , class AllocationProperties , class Enable = void > +struct ViewAllocProp : public Kokkos::Impl::false_type {}; + +template< class Traits > +struct ViewAllocProp< Traits , Kokkos::ViewAllocate + , typename Kokkos::Impl::enable_if<( + Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value + )>::type > + : public Kokkos::Impl::true_type +{ + typedef size_t size_type ; + typedef const ViewAllocate & property_type ; + + enum { Initialize = true }; + enum { AllowPadding = false }; + + inline + static const std::string & label( property_type p ) { return p.label ; } +}; + +template< class Traits > +struct ViewAllocProp< Traits , std::string + , typename Kokkos::Impl::enable_if<( + Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value + )>::type > + : public Kokkos::Impl::true_type +{ + typedef size_t size_type ; + typedef const std::string & property_type ; + + enum { Initialize = true }; + enum { AllowPadding = false }; + + inline + static const std::string & label( property_type s ) { return s ; } +}; + +template< class Traits , unsigned N > +struct ViewAllocProp< Traits , char[N] + , typename Kokkos::Impl::enable_if<( + Traits::is_managed && ! 
Kokkos::Impl::is_const< typename Traits::value_type >::value + )>::type > + : public Kokkos::Impl::true_type +{ +private: + typedef char label_type[N] ; +public: + + typedef size_t size_type ; + typedef const label_type & property_type ; + + enum { Initialize = true }; + enum { AllowPadding = false }; + + inline + static std::string label( property_type s ) { return std::string(s) ; } +}; + +template< class Traits > +struct ViewAllocProp< Traits , Kokkos::ViewAllocateWithoutInitializing + , typename Kokkos::Impl::enable_if<( + Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value + )>::type > + : public Kokkos::Impl::true_type +{ + typedef size_t size_type ; + typedef const Kokkos::ViewAllocateWithoutInitializing & property_type ; + + enum { Initialize = false }; + enum { AllowPadding = false }; + + inline + static std::string label( property_type s ) { return s.label ; } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class Traits , class PointerProperties , class Enable = void > +struct ViewRawPointerProp : public Kokkos::Impl::false_type {}; + +template< class Traits , typename T > +struct ViewRawPointerProp< Traits , T , + typename Kokkos::Impl::enable_if<( + Impl::is_same< T , typename Traits::value_type >::value || + Impl::is_same< T , typename Traits::non_const_value_type >::value + )>::type > + : public Kokkos::Impl::true_type +{ + typedef size_t size_type ; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWSUPPORT_HPP */ + + diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp 
b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp new file mode 100755 index 0000000000000000000000000000000000000000..91d30927a63c8a92f6876a40137ede764e0babab --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp @@ -0,0 +1,195 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEWTILELEFT_HPP +#define KOKKOS_VIEWTILELEFT_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class T , unsigned N0 , unsigned N1 , class MemorySpace , class MemoryTraits > +struct ViewSpecialize< T , void , LayoutTileLeft<N0,N1> , MemorySpace , MemoryTraits > +{ + typedef ViewDefault type ; +}; + +struct ViewTile {}; + +template< class ShapeType , unsigned N0 , unsigned N1 > +struct ViewOffset< ShapeType + , LayoutTileLeft<N0,N1,true> /* Only accept properly shaped tiles */ + , typename Impl::enable_if<( 2 == ShapeType::rank + && + 2 == ShapeType::rank_dynamic + )>::type > + : public ShapeType +{ + enum { SHIFT_0 = Impl::power_of_two<N0>::value }; + enum { SHIFT_1 = Impl::power_of_two<N1>::value }; + enum { MASK_0 = N0 - 1 }; + enum { MASK_1 = N1 - 1 }; + + typedef size_t size_type ; + typedef ShapeType shape_type ; + typedef LayoutTileLeft<N0,N1,true> array_layout ; + + enum { has_padding = true }; + + size_type tile_N0 ; + + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset & rhs ) + { + shape_type::N0 = rhs.N0 ; + shape_type::N1 = rhs.N1 ; + tile_N0 = ( rhs.N0 + MASK_0 ) >> 
SHIFT_0 ; // number of tiles in first dimension + } + + KOKKOS_INLINE_FUNCTION + void assign( size_t n0 , size_t n1 + , int = 0 , int = 0 + , int = 0 , int = 0 + , int = 0 , int = 0 + , int = 0 + ) + { + shape_type::N0 = n0 ; + shape_type::N1 = n1 ; + tile_N0 = ( n0 + MASK_0 ) >> SHIFT_0 ; // number of tiles in first dimension + } + + + KOKKOS_INLINE_FUNCTION + void set_padding() {} + + + template< typename I0 , typename I1 > + KOKKOS_INLINE_FUNCTION + size_type operator()( I0 const & i0 , I1 const & i1 + , int = 0 , int = 0 + , int = 0 , int = 0 + , int = 0 , int = 0 + ) const + { + return /* ( ( Tile offset ) * ( Tile size ) ) */ + ( ( (i0>>SHIFT_0) + tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) ) + + /* ( Offset within tile ) */ + ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ; + } + + template< typename I0 , typename I1 > + KOKKOS_INLINE_FUNCTION + size_type tile_begin( I0 const & i_tile0 , I1 const & i_tile1 ) const + { + return ( i_tile0 + tile_N0 * i_tile1 ) << ( SHIFT_0 + SHIFT_1 ); + } + + + KOKKOS_INLINE_FUNCTION + size_type capacity() const + { + // ( TileDim0 * ( TileDim1 ) ) * TileSize + return ( tile_N0 * ( ( shape_type::N1 + MASK_1 ) >> SHIFT_1 ) ) << ( SHIFT_0 + SHIFT_1 ); + } +}; + +template<> +struct ViewAssignment< ViewTile , void , void > +{ + // Some compilers have type-matching issues on the integer values when using: + // template< class T , unsigned N0 , unsigned N1 , class A2 , class A3 > + template< class T , unsigned dN0 , unsigned dN1 + , class A2 , class A3 + , unsigned sN0 , unsigned sN1 > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View< T[dN0][dN1], LayoutLeft, A2, A3, Impl::ViewDefault > & dst + , View< T** , LayoutTileLeft<sN0,sN1,true>, A2, A3, Impl::ViewDefault > const & src + , size_t const i_tile0 + , typename Impl::enable_if< unsigned(dN0) == unsigned(sN0) && + unsigned(dN1) == unsigned(sN1) + , size_t const + >::type i_tile1 + ) + { + // Destination is always contiguous but source may be non-contiguous + // so don't assign 
the whole view management object. + // Just query and appropriately set the reference-count state. + + if ( ! src.m_management.is_managed() ) dst.m_management.set_unmanaged(); + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map.tile_begin(i_tile0,i_tile1); + + dst.m_tracker = src.m_tracker; + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +namespace Kokkos { + +template< class T , unsigned N0, unsigned N1, class A2, class A3 > +KOKKOS_INLINE_FUNCTION +View< T[N0][N1], LayoutLeft, A2, A3, Impl::ViewDefault > +tile_subview( const View<T**,LayoutTileLeft<N0,N1,true>,A2,A3,Impl::ViewDefault> & src + , const size_t i_tile0 + , const size_t i_tile1 + ) +{ + View< T[N0][N1], LayoutLeft, A2, A3, Impl::ViewDefault > dst ; + + (void) Impl::ViewAssignment< Impl::ViewTile , void , void >( dst , src , i_tile0 , i_tile1 ); + + return dst ; +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWTILELEFT_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp b/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp new file mode 100755 index 0000000000000000000000000000000000000000..420ee63891e6ddb0995ad7bbbcfba2f0548c2bd9 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp @@ -0,0 +1,242 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_VOLATILE_LOAD ) +#define KOKKOS_VOLATILE_LOAD + +#if defined( __GNUC__ ) /* GNU C */ || \ + defined( __GNUG__ ) /* GNU C++ */ || \ + defined( __clang__ ) + +#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__)) + +#else + +#define KOKKOS_MAY_ALIAS + +#endif + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +T volatile_load(T const volatile * const src_ptr) +{ + typedef uint64_t KOKKOS_MAY_ALIAS T64; + typedef uint32_t KOKKOS_MAY_ALIAS T32; + typedef uint16_t KOKKOS_MAY_ALIAS T16; + typedef uint8_t KOKKOS_MAY_ALIAS T8; + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const volatile * const ptr; + T64 const volatile * const ptr64; + T32 const volatile * const ptr32; + T16 const volatile * const ptr16; + T8 const volatile * const ptr8; + } src = {src_ptr}; + + T result; + + union { + T * const ptr; + T64 * const ptr64; + T32 * const ptr32; + T16 * const ptr16; + T8 * const ptr8; + } dst = {&result}; + + for (int i=0; i < NUM_64; ++i) { + dst.ptr64[i] = src.ptr64[i]; + } + + if ( NUM_64*2 < NUM_32 ) { + dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2]; + } + + if ( NUM_32*2 < NUM_16 ) { + dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2]; + } + + if ( NUM_16*2 < NUM_8 ) { + dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2]; + } + + return result; +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +void volatile_store(T volatile * const dst_ptr, T const volatile * const src_ptr) +{ + typedef uint64_t KOKKOS_MAY_ALIAS T64; + typedef uint32_t KOKKOS_MAY_ALIAS T32; + typedef uint16_t KOKKOS_MAY_ALIAS T16; + typedef uint8_t KOKKOS_MAY_ALIAS T8; + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const volatile * const ptr; + T64 const volatile * const ptr64; + T32 const volatile * const ptr32; + T16 const volatile * const ptr16; + T8 
const volatile * const ptr8; + } src = {src_ptr}; + + union { + T volatile * const ptr; + T64 volatile * const ptr64; + T32 volatile * const ptr32; + T16 volatile * const ptr16; + T8 volatile * const ptr8; + } dst = {dst_ptr}; + + for (int i=0; i < NUM_64; ++i) { + dst.ptr64[i] = src.ptr64[i]; + } + + if ( NUM_64*2 < NUM_32 ) { + dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2]; + } + + if ( NUM_32*2 < NUM_16 ) { + dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2]; + } + + if ( NUM_16*2 < NUM_8 ) { + dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2]; + } +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +void volatile_store(T volatile * const dst_ptr, T const * const src_ptr) +{ + typedef uint64_t KOKKOS_MAY_ALIAS T64; + typedef uint32_t KOKKOS_MAY_ALIAS T32; + typedef uint16_t KOKKOS_MAY_ALIAS T16; + typedef uint8_t KOKKOS_MAY_ALIAS T8; + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const * const ptr; + T64 const * const ptr64; + T32 const * const ptr32; + T16 const * const ptr16; + T8 const * const ptr8; + } src = {src_ptr}; + + union { + T volatile * const ptr; + T64 volatile * const ptr64; + T32 volatile * const ptr32; + T16 volatile * const ptr16; + T8 volatile * const ptr8; + } dst = {dst_ptr}; + + for (int i=0; i < NUM_64; ++i) { + dst.ptr64[i] = src.ptr64[i]; + } + + if ( NUM_64*2 < NUM_32 ) { + dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2]; + } + + if ( NUM_32*2 < NUM_16 ) { + dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2]; + } + + if ( NUM_16*2 < NUM_8 ) { + dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2]; + } +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +void volatile_store(T volatile * dst_ptr, T const volatile & src) +{ volatile_store(dst_ptr, &src); } + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +void volatile_store(T volatile * dst_ptr, T const & src) +{ volatile_store(dst_ptr, &src); } + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +T safe_load(T const * const ptr) +{ +#if !defined( 
__MIC__ ) + return *ptr; +#else + return volatile_load(ptr); +#endif +} + +} // namespace kokkos + +#undef KOKKOS_MAY_ALIAS + +#endif + + + diff --git a/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp new file mode 100755 index 0000000000000000000000000000000000000000..1d173fb4fb42b267953f57ef263bccb7f89f8297 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp @@ -0,0 +1,704 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#define DEBUG_PRINT 0 + +#include <iostream> +#include <sstream> + +#include <Kokkos_Macros.hpp> +#include <Kokkos_hwloc.hpp> +#include <impl/Kokkos_Error.hpp> + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace hwloc { + +/* Return 0 if asynchronous, 1 if synchronous and include process. */ +unsigned thread_mapping( const char * const label , + const bool allow_async , + unsigned & thread_count , + unsigned & use_numa_count , + unsigned & use_cores_per_numa , + std::pair<unsigned,unsigned> threads_coord[] ) +{ + const bool hwloc_avail = Kokkos::hwloc::available(); + const unsigned avail_numa_count = hwloc_avail ? hwloc::get_available_numa_count() : 1 ; + const unsigned avail_cores_per_numa = hwloc_avail ? hwloc::get_available_cores_per_numa() : thread_count ; + const unsigned avail_threads_per_core = hwloc_avail ? hwloc::get_available_threads_per_core() : 1 ; + + // (numa,core) coordinate of the process: + const std::pair<unsigned,unsigned> proc_coord = Kokkos::hwloc::get_this_thread_coordinate(); + + //------------------------------------------------------------------------ + // Defaults for unspecified inputs: + + if ( ! 
use_numa_count ) { + // Default to use all NUMA regions + use_numa_count = ! thread_count ? avail_numa_count : ( + thread_count < avail_numa_count ? thread_count : avail_numa_count ); + } + + if ( ! use_cores_per_numa ) { + // Default to use all but one core if asynchronous, all cores if synchronous. + const unsigned threads_per_numa = thread_count / use_numa_count ; + + use_cores_per_numa = ! threads_per_numa ? avail_cores_per_numa - ( allow_async ? 1 : 0 ) : ( + threads_per_numa < avail_cores_per_numa ? threads_per_numa : avail_cores_per_numa ); + } + + if ( ! thread_count ) { + thread_count = use_numa_count * use_cores_per_numa * avail_threads_per_core ; + } + + //------------------------------------------------------------------------ + // Input verification: + + const bool valid_numa = use_numa_count <= avail_numa_count ; + const bool valid_cores = use_cores_per_numa && + use_cores_per_numa <= avail_cores_per_numa ; + const bool valid_threads = thread_count && + thread_count <= use_numa_count * use_cores_per_numa * avail_threads_per_core ; + const bool balanced_numa = ! ( thread_count % use_numa_count ); + const bool balanced_cores = ! ( thread_count % ( use_numa_count * use_cores_per_numa ) ); + + const bool valid_input = valid_numa && valid_cores && valid_threads && balanced_numa && balanced_cores ; + + if ( ! valid_input ) { + + std::ostringstream msg ; + + msg << label << " HWLOC ERROR(s)" ; + + if ( ! valid_threads ) { + msg << " : thread_count(" << thread_count + << ") exceeds capacity(" + << use_numa_count * use_cores_per_numa * avail_threads_per_core + << ")" ; + } + if ( ! valid_numa ) { + msg << " : use_numa_count(" << use_numa_count + << ") exceeds capacity(" << avail_numa_count << ")" ; + } + if ( ! valid_cores ) { + msg << " : use_cores_per_numa(" << use_cores_per_numa + << ") exceeds capacity(" << avail_cores_per_numa << ")" ; + } + if ( ! 
balanced_numa ) { + msg << " : thread_count(" << thread_count + << ") imbalanced among numa(" << use_numa_count << ")" ; + } + if ( ! balanced_cores ) { + msg << " : thread_count(" << thread_count + << ") imbalanced among cores(" << use_numa_count * use_cores_per_numa << ")" ; + } + + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + + const unsigned thread_spawn_synchronous = + ( allow_async && + 1 < thread_count && + ( use_numa_count < avail_numa_count || + use_cores_per_numa < avail_cores_per_numa ) ) + ? 0 /* asyncronous */ + : 1 /* synchronous, threads_coord[0] is process core */ ; + + // Determine binding coordinates for to-be-spawned threads so that + // threads may be bound to cores as they are spawned. + + const unsigned threads_per_core = thread_count / ( use_numa_count * use_cores_per_numa ); + + if ( thread_spawn_synchronous ) { + // Working synchronously and include process core as threads_coord[0]. + // Swap the NUMA coordinate of the process core with 0 + // Swap the CORE coordinate of the process core with 0 + for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) { + const unsigned numa_coord = 0 == inuma ? proc_coord.first : ( proc_coord.first == inuma ? 0 : inuma ); + for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) { + const unsigned core_coord = 0 == icore ? proc_coord.second : ( proc_coord.second == icore ? 0 : icore ); + for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) { + threads_coord[i].first = numa_coord ; + threads_coord[i].second = core_coord ; + } + } + } + } + else if ( use_numa_count < avail_numa_count ) { + // Working asynchronously and omit the process' NUMA region from the pool. 
+ // Swap the NUMA coordinate of the process core with ( ( avail_numa_count - use_numa_count ) - 1 ) + const unsigned numa_coord_swap = ( avail_numa_count - use_numa_count ) - 1 ; + for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) { + const unsigned numa_coord = proc_coord.first == inuma ? numa_coord_swap : inuma ; + for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) { + const unsigned core_coord = icore ; + for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) { + threads_coord[i].first = numa_coord ; + threads_coord[i].second = core_coord ; + } + } + } + } + else if ( use_cores_per_numa < avail_cores_per_numa ) { + // Working asynchronously and omit the process' core from the pool. + // Swap the CORE coordinate of the process core with ( ( avail_cores_per_numa - use_cores_per_numa ) - 1 ) + const unsigned core_coord_swap = ( avail_cores_per_numa - use_cores_per_numa ) - 1 ; + for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) { + const unsigned numa_coord = inuma ; + for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) { + const unsigned core_coord = proc_coord.second == icore ? 
core_coord_swap : icore ; + for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) { + threads_coord[i].first = numa_coord ; + threads_coord[i].second = core_coord ; + } + } + } + } + + return thread_spawn_synchronous ; +} + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +#if defined( KOKKOS_HAVE_HWLOC ) + +#include <iostream> +#include <sstream> +#include <stdexcept> + +/*--------------------------------------------------------------------------*/ +/* Third Party Libraries */ + +/* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */ +#include <hwloc.h> + +#define REQUIRED_HWLOC_API_VERSION 0x000010300 + +#if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION +#error "Requires http://www.open-mpi.org/projects/hwloc/ Version 1.3 or greater" +#endif + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace hwloc { +namespace { + +#if DEBUG_PRINT + +inline +void print_bitmap( std::ostream & s , const hwloc_const_bitmap_t bitmap ) +{ + s << "{" ; + for ( int i = hwloc_bitmap_first( bitmap ) ; + -1 != i ; i = hwloc_bitmap_next( bitmap , i ) ) { + s << " " << i ; + } + s << " }" ; +} + +#endif + +enum { MAX_CORE = 1024 }; + +std::pair<unsigned,unsigned> s_core_topology(0,0); +unsigned s_core_capacity(0); +hwloc_topology_t s_hwloc_topology(0); +hwloc_bitmap_t s_hwloc_location(0); +hwloc_bitmap_t s_process_binding(0); +hwloc_bitmap_t s_core[ MAX_CORE ]; + +struct Sentinel { + ~Sentinel(); + Sentinel(); +}; + +bool sentinel() +{ + static Sentinel self ; + + if ( 0 == s_hwloc_topology ) { + std::cerr << "Kokkos::hwloc ERROR : Called after return from main()" << std::endl ; + std::cerr.flush(); + } + + return 0 != s_hwloc_topology ; +} + +Sentinel::~Sentinel() +{ + hwloc_topology_destroy( s_hwloc_topology ); + hwloc_bitmap_free( 
s_process_binding ); + hwloc_bitmap_free( s_hwloc_location ); + + s_core_topology.first = 0 ; + s_core_topology.second = 0 ; + s_core_capacity = 0 ; + s_hwloc_topology = 0 ; + s_hwloc_location = 0 ; + s_process_binding = 0 ; +} + +Sentinel::Sentinel() +{ +#if defined(__MIC__) + static const bool remove_core_0 = true ; +#else + static const bool remove_core_0 = false ; +#endif + + s_core_topology = std::pair<unsigned,unsigned>(0,0); + s_core_capacity = 0 ; + s_hwloc_topology = 0 ; + s_hwloc_location = 0 ; + s_process_binding = 0 ; + + for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ; + + hwloc_topology_init( & s_hwloc_topology ); + hwloc_topology_load( s_hwloc_topology ); + + s_hwloc_location = hwloc_bitmap_alloc(); + s_process_binding = hwloc_bitmap_alloc(); + + hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS ); + + if ( remove_core_0 ) { + + const hwloc_obj_t core = hwloc_get_obj_by_type( s_hwloc_topology , HWLOC_OBJ_CORE , 0 ); + + if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) { + + hwloc_bitmap_t s_process_no_core_zero = hwloc_bitmap_alloc(); + + hwloc_bitmap_andnot( s_process_no_core_zero , s_process_binding , core->allowed_cpuset ); + + bool ok = 0 == hwloc_set_cpubind( s_hwloc_topology , + s_process_no_core_zero , + HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT ); + + if ( ok ) { + hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS ); + + ok = 0 != hwloc_bitmap_isequal( s_process_binding , s_process_no_core_zero ); + } + + hwloc_bitmap_free( s_process_no_core_zero ); + + if ( ! ok ) { + std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ; + } + } + } + + // Choose a hwloc object type for the NUMA level, which may not exist. + + hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ; + + { + // Object types to search, in order. 
+ static const hwloc_obj_type_t candidate_root_type[] = + { HWLOC_OBJ_NODE /* NUMA region */ + , HWLOC_OBJ_SOCKET /* hardware socket */ + , HWLOC_OBJ_MACHINE /* local machine */ + }; + + enum { CANDIDATE_ROOT_TYPE_COUNT = + sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) }; + + for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) { + if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) { + root_type = candidate_root_type[k] ; + } + } + } + + // Determine which of these 'root' types are available to this process. + // The process may have been bound (e.g., by MPI) to a subset of these root types. + // Determine current location of the master (calling) process> + + hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc(); + + hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD ); + + const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type ); + + unsigned root_base = max_root ; + unsigned root_count = 0 ; + unsigned core_per_root = 0 ; + unsigned pu_per_core = 0 ; + bool symmetric = true ; + + for ( unsigned i = 0 ; i < max_root ; ++i ) { + + const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i ); + + if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) { + + ++root_count ; + + // Remember which root (NUMA) object the master thread is running on. + // This will be logical NUMA rank #0 for this process. 
+ + if ( hwloc_bitmap_intersects( proc_cpuset_location, root->allowed_cpuset ) ) { + root_base = i ; + } + + // Count available cores: + + const unsigned max_core = + hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology , + root->allowed_cpuset , + HWLOC_OBJ_CORE ); + + unsigned core_count = 0 ; + + for ( unsigned j = 0 ; j < max_core ; ++j ) { + + const hwloc_obj_t core = + hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology , + root->allowed_cpuset , + HWLOC_OBJ_CORE , j ); + + // If process' cpuset intersects core's cpuset then process can access this core. + // Must use intersection instead of inclusion because the Intel-Phi + // MPI may bind the process to only one of the core's hyperthreads. + // + // Assumption: if the process can access any hyperthread of the core + // then it has ownership of the entire core. + // This assumes that it would be performance-detrimental + // to spawn more than one MPI process per core and use nested threading. + + if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) { + + ++core_count ; + + const unsigned pu_count = + hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology , + core->allowed_cpuset , + HWLOC_OBJ_PU ); + + if ( pu_per_core == 0 ) pu_per_core = pu_count ; + + // Enforce symmetry by taking the minimum: + + pu_per_core = std::min( pu_per_core , pu_count ); + + if ( pu_count != pu_per_core ) symmetric = false ; + } + } + + if ( 0 == core_per_root ) core_per_root = core_count ; + + // Enforce symmetry by taking the minimum: + + core_per_root = std::min( core_per_root , core_count ); + + if ( core_count != core_per_root ) symmetric = false ; + } + } + + s_core_topology.first = root_count ; + s_core_topology.second = core_per_root ; + s_core_capacity = pu_per_core ; + + // Fill the 's_core' array for fast mapping from a core coordinate to the + // hwloc cpuset object required for thread location querying and binding. 
+ + for ( unsigned i = 0 ; i < max_root ; ++i ) { + + const unsigned root_rank = ( i + root_base ) % max_root ; + + const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank ); + + if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) { + + const unsigned max_core = + hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology , + root->allowed_cpuset , + HWLOC_OBJ_CORE ); + + unsigned core_count = 0 ; + + for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) { + + const hwloc_obj_t core = + hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology , + root->allowed_cpuset , + HWLOC_OBJ_CORE , j ); + + if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) { + + s_core[ core_count + core_per_root * i ] = core->allowed_cpuset ; + + ++core_count ; + } + } + } + } + + hwloc_bitmap_free( proc_cpuset_location ); + + if ( ! symmetric ) { + std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology." 
+ << std::endl ; + } +} + + +} // namespace + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +bool available() +{ return true ; } + +unsigned get_available_numa_count() +{ sentinel(); return s_core_topology.first ; } + +unsigned get_available_cores_per_numa() +{ sentinel(); return s_core_topology.second ; } + +unsigned get_available_threads_per_core() +{ sentinel(); return s_core_capacity ; } + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +unsigned bind_this_thread( + const unsigned coordinate_count , + std::pair<unsigned,unsigned> coordinate[] ) +{ + unsigned i = 0 ; + + try { + const std::pair<unsigned,unsigned> current = get_this_thread_coordinate(); + + // Match one of the requests: + for ( i = 0 ; i < coordinate_count && current != coordinate[i] ; ++i ); + + if ( coordinate_count == i ) { + // Match the first request (typically NUMA): + for ( i = 0 ; i < coordinate_count && current.first != coordinate[i].first ; ++i ); + } + + if ( coordinate_count == i ) { + // Match any unclaimed request: + for ( i = 0 ; i < coordinate_count && ~0u == coordinate[i].first ; ++i ); + } + + if ( coordinate_count == i || ! bind_this_thread( coordinate[i] ) ) { + // Failed to bind: + i = ~0u ; + } + + if ( i < coordinate_count ) { + +#if DEBUG_PRINT + if ( current != coordinate[i] ) { + std::cout << " bind_this_thread: rebinding from (" + << current.first << "," + << current.second + << ") to (" + << coordinate[i].first << "," + << coordinate[i].second + << ")" << std::endl ; + } +#endif + + coordinate[i].first = ~0u ; + coordinate[i].second = ~0u ; + } + } + catch( ... ) { + i = ~0u ; + } + + return i ; +} + + +bool bind_this_thread( const std::pair<unsigned,unsigned> coord ) +{ + if ( ! 
sentinel() ) return false ; + +#if DEBUG_PRINT + + std::cout << "Kokkos::bind_this_thread() at " ; + + hwloc_get_last_cpu_location( s_hwloc_topology , + s_hwloc_location , HWLOC_CPUBIND_THREAD ); + + print_bitmap( std::cout , s_hwloc_location ); + + std::cout << " to " ; + + print_bitmap( std::cout , s_core[ coord.second + coord.first * s_core_topology.second ] ); + + std::cout << std::endl ; + +#endif + + // As safe and fast as possible. + // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'. + return coord.first < s_core_topology.first && + coord.second < s_core_topology.second && + 0 == hwloc_set_cpubind( s_hwloc_topology , + s_core[ coord.second + coord.first * s_core_topology.second ] , + HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT ); +} + +bool unbind_this_thread() +{ + if ( ! sentinel() ) return false ; + +#define HWLOC_DEBUG_PRINT 0 + +#if HWLOC_DEBUG_PRINT + + std::cout << "Kokkos::unbind_this_thread() from " ; + + hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD ); + + print_bitmap( std::cout , s_hwloc_location ); + +#endif + + const bool result = + s_hwloc_topology && + 0 == hwloc_set_cpubind( s_hwloc_topology , + s_process_binding , + HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT ); + +#if HWLOC_DEBUG_PRINT + + std::cout << " to " ; + + hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD ); + + print_bitmap( std::cout , s_hwloc_location ); + + std::cout << std::endl ; + +#endif + + return result ; + +#undef HWLOC_DEBUG_PRINT + +} + +//---------------------------------------------------------------------------- + +std::pair<unsigned,unsigned> get_this_thread_coordinate() +{ + std::pair<unsigned,unsigned> coord(0u,0u); + + if ( ! sentinel() ) return coord ; + + const unsigned n = s_core_topology.first * s_core_topology.second ; + + // Using the pre-allocated 's_hwloc_location' to avoid memory + // allocation by this thread. This call is NOT thread-safe. 
+ hwloc_get_last_cpu_location( s_hwloc_topology , + s_hwloc_location , HWLOC_CPUBIND_THREAD ); + + unsigned i = 0 ; + + while ( i < n && ! hwloc_bitmap_intersects( s_hwloc_location , s_core[ i ] ) ) ++i ; + + if ( i < n ) { + coord.first = i / s_core_topology.second ; + coord.second = i % s_core_topology.second ; + } + + return coord ; +} + +//---------------------------------------------------------------------------- + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#else /* ! defined( KOKKOS_HAVE_HWLOC ) */ + +namespace Kokkos { +namespace hwloc { + +bool available() { return false ; } + +unsigned get_available_numa_count() { return 1 ; } +unsigned get_available_cores_per_numa() { return 1 ; } +unsigned get_available_threads_per_core() { return 1 ; } + +unsigned bind_this_thread( const unsigned , std::pair<unsigned,unsigned>[] ) +{ return ~0 ; } + +bool bind_this_thread( const std::pair<unsigned,unsigned> ) +{ return false ; } + +bool unbind_this_thread() +{ return true ; } + +std::pair<unsigned,unsigned> get_this_thread_coordinate() +{ return std::pair<unsigned,unsigned>(0,0); } + +} // namespace hwloc +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif + + diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp new file mode 100755 index 0000000000000000000000000000000000000000..abd845da9123d1f1b659faa1d5c167b9528f4fe4 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp @@ -0,0 +1,82 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_spinwait.hpp> + +/*--------------------------------------------------------------------------*/ + +#if ( KOKKOS_ENABLE_ASM ) + #if defined( __arm__ ) + /* No-operation instruction to idle the thread. */ + #define YIELD asm volatile("nop") + #else + /* Pause instruction to prevent excess processor bus usage */ + #define YIELD asm volatile("pause\n":::"memory") + #endif +#elif defined ( KOKKOS_HAVE_WINTHREAD ) + #include <process.h> + #define YIELD Sleep(0) +#elif defined ( _WIN32 ) + #define YIELD __asm__ __volatile__("pause\n":::"memory") +#else + #include <sched.h> + #define YIELD sched_yield() +#endif + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) +void spinwait( volatile int & flag , const int value ) +{ + while ( value == flag ) { + YIELD ; + } +} +#endif + +} /* namespace Impl */ +} /* namespace Kokkos */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp new file mode 100755 index 0000000000000000000000000000000000000000..cc87771faefcb8ad7716842890dbec4a9c1219a1 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp @@ -0,0 +1,64 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#ifndef KOKKOS_SPINWAIT_HPP +#define KOKKOS_SPINWAIT_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { +namespace Impl { + +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) +void spinwait( volatile int & flag , const int value ); +#else +KOKKOS_INLINE_FUNCTION +void spinwait( volatile int & , const int ) {} +#endif + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#endif /* #ifndef KOKKOS_SPINWAIT_HPP */ + diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..b2d3d55066406c6911929ce3659f0a9e50187c2a --- /dev/null +++ b/lib/kokkos/core/unit_test/Makefile @@ -0,0 +1,146 @@ +KOKKOS_PATH = ../.. + +GTEST_PATH = ../../TPL/gtest + +vpath %.cpp ${KOKKOS_PATH}/core/unit_test +TEST_HEADERS = $(wildcard $(KOKKOS_PATH)/core/unit_test/*.hpp) + +default: build_all + echo "End Build" + + +include $(KOKKOS_PATH)/Makefile.kokkos + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + CXX = nvcc_wrapper + CXXFLAGS ?= -O3 + LINK = $(CXX) + LDFLAGS ?= -lpthread +else + CXX ?= g++ + CXXFLAGS ?= -O3 + LINK ?= $(CXX) + LDFLAGS ?= -lpthread +endif + +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/unit_test + +TEST_TARGETS = +TARGETS = + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o + TARGETS += KokkosCore_UnitTest_Cuda + TEST_TARGETS += test-cuda +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o + TARGETS += KokkosCore_UnitTest_Threads + TEST_TARGETS += test-threads +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o + TARGETS += KokkosCore_UnitTest_OpenMP + TEST_TARGETS += test-openmp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + OBJ_SERIAL = TestSerial.o UnitTestMain.o 
gtest-all.o + TARGETS += KokkosCore_UnitTest_Serial + TEST_TARGETS += test-serial +endif + +ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1) + OBJ_QTHREAD = TestQthread.o UnitTestMain.o gtest-all.o + TARGETS += KokkosCore_UnitTest_Qthread + TEST_TARGETS += test-qthread +endif + +OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o +TARGETS += KokkosCore_UnitTest_HWLOC +TEST_TARGETS += test-hwloc + +OBJ_ALLOCATIONTRACKER = TestAllocationTracker.o UnitTestMain.o gtest-all.o +TARGETS += KokkosCore_UnitTest_AllocationTracker +TEST_TARGETS += test-allocationtracker + +OBJ_DEFAULT = TestDefaultDeviceType.o UnitTestMain.o gtest-all.o +TARGETS += KokkosCore_UnitTest_Default +TEST_TARGETS += test-default + +OBJ_DEFAULTINIT = TestDefaultDeviceTypeInit.o UnitTestMain.o gtest-all.o +TARGETS += KokkosCore_UnitTest_DefaultInit +TEST_TARGETS += test-default-init + + +KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Cuda + +KokkosCore_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Threads + +KokkosCore_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_OpenMP + +KokkosCore_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Serial + +KokkosCore_UnitTest_Qthread: $(OBJ_QTHREAD) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREAD) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthread + +KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_HWLOC + 
+KokkosCore_UnitTest_AllocationTracker: $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_AllocationTracker + +KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Default + +KokkosCore_UnitTest_DefaultInit: $(OBJ_DEFAULTINIT) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_DEFAULTINIT) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultInit + +test-cuda: KokkosCore_UnitTest_Cuda + ./KokkosCore_UnitTest_Cuda + +test-threads: KokkosCore_UnitTest_Threads + ./KokkosCore_UnitTest_Threads + +test-openmp: KokkosCore_UnitTest_OpenMP + ./KokkosCore_UnitTest_OpenMP + +test-serial: KokkosCore_UnitTest_Serial + ./KokkosCore_UnitTest_Serial + +test-qthread: KokkosCore_UnitTest_Qthread + ./KokkosCore_UnitTest_Qthread + +test-hwloc: KokkosCore_UnitTest_HWLOC + ./KokkosCore_UnitTest_HWLOC + +test-allocationtracker: KokkosCore_UnitTest_AllocationTracker + ./KokkosCore_UnitTest_AllocationTracker + +test-default: KokkosCore_UnitTest_Default + ./KokkosCore_UnitTest_Default + +test-default-init: KokkosCore_UnitTest_DefaultInit + ./KokkosCore_UnitTest_DefaultInit + +build_all: $(TARGETS) + +test: $(TEST_TARGETS) + +clean: kokkos-clean + rm -f *.o $(TARGETS) + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(TEST_HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< + +gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc + diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp new file mode 100755 index 0000000000000000000000000000000000000000..35e7a8930d81115b99b8f7e7fad4258a22c204ca --- /dev/null +++ 
b/lib/kokkos/core/unit_test/TestAggregate.hpp @@ -0,0 +1,716 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_AGGREGATE_HPP +#define TEST_AGGREGATE_HPP + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +struct EmbedArray {}; + +struct ArrayProxyContiguous {}; +struct ArrayProxyStrided {}; + +template< typename T , unsigned N = 0 , class Proxy = void > +struct Array ; + +template< typename T > +struct Array<T,0,ArrayProxyContiguous> +{ +public: + typedef T value_type ; + + enum { StaticLength = 0 }; + T * const value ; + const unsigned count ; + + KOKKOS_INLINE_FUNCTION + Array( T * v , unsigned n ) : value(v), count(n) {} + + template< class Proxy > + KOKKOS_INLINE_FUNCTION + Array & operator = ( const Array<T,0,Proxy> & rhs ) { return *this ; } +}; + +template< typename T , unsigned N > +struct Array<T,N,ArrayProxyContiguous> +{ +public: + typedef T value_type ; + + enum { StaticLength = N }; + T * const value ; + + KOKKOS_INLINE_FUNCTION + Array( T * v , unsigned ) : value(v) {} + + template< class Proxy > + KOKKOS_INLINE_FUNCTION + Array & operator = ( const Array<T,N,Proxy> & rhs ) { return *this ; } +}; + +template< typename T , unsigned N > +struct Array<T,N,ArrayProxyStrided> +{ +public: + typedef T value_type ; + + enum { StaticLength = N }; + T * const value ; + const unsigned stride ; + + KOKKOS_INLINE_FUNCTION + Array( T * v , unsigned , unsigned s ) : value(v), stride(s) {} + + template< class Proxy > + KOKKOS_INLINE_FUNCTION + Array & operator = ( const Array<T,N,Proxy> & rhs ) { return *this ; } +}; + +template< typename T > +struct Array<T,0,ArrayProxyStrided> +{ +public: + typedef T value_type ; + + enum { StaticLength = 0 }; + T * const value ; + const unsigned count ; + const unsigned stride ; + + KOKKOS_INLINE_FUNCTION + Array( T * v , unsigned n , unsigned s ) : 
value(v), count(n), stride(s) {} + + template< class Proxy > + KOKKOS_INLINE_FUNCTION + Array & operator = ( const Array<T,0,Proxy> & rhs ) { return *this ; } +}; + +template< typename T > +struct Array<T,0,void> +{ +public: + typedef T value_type ; + + enum { StaticLength = 0 }; + T * value ; + const unsigned count ; + + KOKKOS_INLINE_FUNCTION + Array() : value(0) , count(0) {} + + template< unsigned N , class Proxy > + KOKKOS_INLINE_FUNCTION + Array( const Array<T,N,Proxy> & rhs ) : value(rhs.value), count(N) {} +}; + +template< typename T , unsigned N > +struct Array<T,N,void> +{ +public: + typedef T value_type ; + + enum { StaticLength = N }; + T value[N] ; + + template< class Proxy > + KOKKOS_INLINE_FUNCTION + Array & operator = ( const Array<T,N,Proxy> & ) { return *this ; } +}; + +} // namespace Test + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + +namespace Kokkos { +namespace Impl { + +template< typename T , unsigned N > +struct AnalyzeShape< Test::Array< T , N > > + : public ShapeInsert< typename AnalyzeShape< T >::shape , N >::type +{ +private: + typedef AnalyzeShape< T > nested ; +public: + + typedef Test::EmbedArray specialize ; + + typedef typename ShapeInsert< typename nested::shape , N >::type shape ; + + typedef typename nested::array_intrinsic_type array_intrinsic_type[ N ]; + typedef Test::Array< T , N > value_type ; + typedef Test::Array< T , N > type ; + + typedef const array_intrinsic_type const_array_intrinsic_type ; + typedef const value_type const_value_type ; + typedef const type const_type ; + + typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type[ N ]; + typedef Test::Array< typename nested::non_const_value_type , N > non_const_value_type ; + typedef Test::Array< typename nested::non_const_value_type , N > non_const_type ; +}; + +template< 
typename T > +struct AnalyzeShape< Test::Array< T , 0 > > + : public ShapeInsert< typename AnalyzeShape< T >::shape , 0 >::type +{ +private: + typedef AnalyzeShape< T > nested ; +public: + + typedef Test::EmbedArray specialize ; + + typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; + + typedef typename nested::array_intrinsic_type * array_intrinsic_type ; + typedef Test::Array< T , 0 > value_type ; + typedef Test::Array< T , 0 > type ; + + typedef const array_intrinsic_type const_array_intrinsic_type ; + typedef const value_type const_value_type ; + typedef const type const_type ; + + typedef typename nested::non_const_array_intrinsic_type * non_const_array_intrinsic_type ; + typedef Test::Array< typename nested::non_const_value_type , 0 > non_const_value_type ; + typedef Test::Array< typename nested::non_const_value_type , 0 > non_const_type ; +}; + +/*--------------------------------------------------------------------------*/ + +template< class ValueType , class MemorySpace , class MemoryTraits > +struct ViewSpecialize< ValueType + , Test::EmbedArray + , LayoutLeft + , MemorySpace + , MemoryTraits > +{ typedef Test::EmbedArray type ; }; + +template< class ValueType , class MemorySpace , class MemoryTraits > +struct ViewSpecialize< ValueType + , Test::EmbedArray + , LayoutRight + , MemorySpace + , MemoryTraits > +{ typedef Test::EmbedArray type ; }; + +/*--------------------------------------------------------------------------*/ + +template<> +struct ViewAssignment< Test::EmbedArray , Test::EmbedArray , void > +{ + //------------------------------------ + /** \brief Compatible value and shape */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Test::EmbedArray> & dst + , const View<ST,SL,SD,SM,Test::EmbedArray> & src + , const typename enable_if<( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::value + 
)>::type * = 0 + ) + { + dst.m_offset_map.assign( src.m_offset_map ); + + dst.m_ptr_on_device = src.m_ptr_on_device ; + + dst.m_tracker = src.m_tracker; + } +}; + +template<> +struct ViewAssignment< ViewDefault , Test::EmbedArray , void > +{ + //------------------------------------ + /** \brief Compatible value and shape */ + + template< class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( typename View<ST,SL,SD,SM,Test::EmbedArray>::array_type & dst + , const View<ST,SL,SD,SM,Test::EmbedArray> & src + ) + { + dst.m_offset_map.assign( src.m_offset_map ); + + dst.m_ptr_on_device = src.m_ptr_on_device ; + + dst.m_tracker = src.m_tracker; + } +}; + + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +template< class DataType , + class Arg1Type , + class Arg2Type , + class Arg3Type > +class View< DataType , Arg1Type , Arg2Type , Arg3Type , Test::EmbedArray > + : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > +{ +public: + + typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ; + +private: + + // Assignment of compatible views requirement: + template< class , class , class , class , class > friend class View ; + + // Assignment of compatible subview requirement: + template< class , class , class > friend struct Impl::ViewAssignment ; + + typedef Impl::ViewOffset< typename traits::shape_type , + typename traits::array_layout > offset_map_type ; + + typedef Impl::ViewDataManagement< traits > view_data_management ; + + // traits::value_type = Test::Array< T , N > + + typename traits::value_type::value_type * m_ptr_on_device ; + offset_map_type m_offset_map ; + view_data_management m_management ; + Impl::AllocationTracker m_tracker ; + +public: + + typedef View< typename traits::array_intrinsic_type , + typename 
traits::array_layout , + typename traits::execution_space , + typename traits::memory_traits > array_type ; + + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::execution_space , + typename traits::memory_traits > non_const_type ; + + typedef View< typename traits::const_data_type , + typename traits::array_layout , + typename traits::execution_space , + typename traits::memory_traits > const_type ; + + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::host_mirror_space , + void > HostMirror ; + + //------------------------------------ + // Shape + + enum { Rank = traits::rank - 1 }; + + KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_offset_map ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type size() const + { + return m_offset_map.N0 + * m_offset_map.N1 + * m_offset_map.N2 + * m_offset_map.N3 + * m_offset_map.N4 + * m_offset_map.N5 + * m_offset_map.N6 + * m_offset_map.N7 + ; + } + + template< typename iType > + KOKKOS_INLINE_FUNCTION + typename traits::size_type dimension( const iType & i ) const + { return Impl::dimension( 
m_offset_map , i ); } + + //------------------------------------ + // Destructor, constructors, assignment operators: + + KOKKOS_INLINE_FUNCTION + ~View() {} + + KOKKOS_INLINE_FUNCTION + View() + : m_ptr_on_device(0) + , m_offset_map() + , m_management() + , m_tracker() + { m_offset_map.assing(0,0,0,0,0,0,0,0); } + + KOKKOS_INLINE_FUNCTION + View( const View & rhs ) + : m_ptr_on_device(0) + , m_offset_map() + , m_management() + , m_tracker() + { + (void) Impl::ViewAssignment< + typename traits::specialize , + typename traits::specialize >( *this , rhs ); + } + + KOKKOS_INLINE_FUNCTION + View & operator = ( const View & rhs ) + { + (void) Impl::ViewAssignment< + typename traits::specialize , + typename traits::specialize >( *this , rhs ); + return *this ; + } + + //------------------------------------ + // Construct or assign compatible view: + + template< class RT , class RL , class RD , class RM , class RS > + KOKKOS_INLINE_FUNCTION + View( const View<RT,RL,RD,RM,RS> & rhs ) + : m_ptr_on_device(0) + , m_offset_map() + , m_management() + , m_tracker() + { + (void) Impl::ViewAssignment< + typename traits::specialize , RS >( *this , rhs ); + } + + template< class RT , class RL , class RD , class RM , class RS > + KOKKOS_INLINE_FUNCTION + View & operator = ( const View<RT,RL,RD,RM,RS> & rhs ) + { + (void) Impl::ViewAssignment< + typename traits::specialize , RS >( *this , rhs ); + return *this ; + } + + //------------------------------------ + // Allocation of a managed view with possible alignment padding. 
+ + template< class AllocationProperties > + explicit inline + View( const AllocationProperties & prop , + const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) + : m_ptr_on_device(0) + , m_offset_map() + , m_management() + , m_tracker() + { + typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ; + + typedef typename traits::memory_space memory_space ; + typedef typename traits::value_type::value_type scalar_type ; + + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); + m_offset_map.set_padding(); + + m_tracker = memory_space::allocate_and_track( Alloc::label( prop ), sizeof(scalar_type) * m_offset_map.capacity() ); + + m_ptr_on_device = reinterpret_cast<scalar_type *>(m_tracker.alloc_ptr()); + + (void) Impl::ViewDefaultConstruct< typename traits::execution_space , scalar_type , Alloc::Initialize >( m_ptr_on_device , m_offset_map.capacity() ); + } + + //------------------------------------ + // Assign an unmanaged View from pointer, can be called in functors. + // No alignment padding is performed. + + typedef Impl::if_c< ! 
traits::is_managed , + typename traits::value_type::value_type * , + Impl::ViewError::user_pointer_constructor_requires_unmanaged > + if_user_pointer_constructor ; + + View( typename if_user_pointer_constructor::type ptr , + const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) + : m_ptr_on_device(0) + , m_offset_map() + , m_management() + , m_tracker() + { + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); + m_ptr_on_device = if_user_pointer_constructor::select( ptr ); + m_management.set_unmanaged(); + } + + //------------------------------------ + // Assign unmanaged View to portion of Device shared memory + + typedef Impl::if_c< ! traits::is_managed , + typename traits::execution_space , + Impl::ViewError::device_shmem_constructor_requires_unmanaged > + if_device_shmem_constructor ; + + explicit KOKKOS_INLINE_FUNCTION + View( typename if_device_shmem_constructor::type & dev , + const unsigned n0 = 0 , + const unsigned n1 = 0 , + const unsigned n2 = 0 , + const unsigned n3 = 0 , + const unsigned n4 = 0 , + const unsigned n5 = 0 , + const unsigned n6 = 0 , + const unsigned n7 = 0 ) + : m_ptr_on_device(0) + , m_offset_map() + , m_management() + , m_tracker() + { + typedef typename traits::value_type::value_type scalar_type ; + + enum { align = 8 }; + enum { mask = align - 1 }; + + typedef Impl::if_c< ! 
traits::is_managed , + scalar_type * , + Impl::ViewError::device_shmem_constructor_requires_unmanaged > + if_device_shmem_pointer ; + + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); + + // Select the first argument: + m_ptr_on_device = if_device_shmem_pointer::select( + (scalar_type *) dev.get_shmem( unsigned( sizeof(scalar_type) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) ); + } + + static inline + unsigned shmem_size( const unsigned n0 = 0 , + const unsigned n1 = 0 , + const unsigned n2 = 0 , + const unsigned n3 = 0 , + const unsigned n4 = 0 , + const unsigned n5 = 0 , + const unsigned n6 = 0 , + const unsigned n7 = 0 ) + { + enum { align = 8 }; + enum { mask = align - 1 }; + + typedef typename traits::value_type::value_type scalar_type ; + + offset_map_type offset_map ; + + offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); + + return unsigned( sizeof(scalar_type) * offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ; + } + + //------------------------------------ + // Is not allocated + + KOKKOS_INLINE_FUNCTION + bool is_null() const { return 0 == m_ptr_on_device ; } + + //------------------------------------ + // LayoutLeft, rank 2: + + typedef Test::Array< typename traits::value_type::value_type , + traits::value_type::StaticLength , + Test::ArrayProxyStrided > LeftValue ; + + template< typename iType0 > + KOKKOS_INLINE_FUNCTION + typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type + operator[] ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 ); + } + + template< typename iType0 > + KOKKOS_INLINE_FUNCTION + typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type + operator() ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 
0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 ); + } + + template< typename iType0 > + KOKKOS_INLINE_FUNCTION + typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type + at( const iType0 & i0 , const int , const int , const int , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 ); + } + + //------------------------------------ + // LayoutRight, rank 2: + + typedef Test::Array< typename traits::value_type::value_type , + traits::value_type::StaticLength , + Test::ArrayProxyContiguous > RightValue ; + + template< typename iType0 > + KOKKOS_INLINE_FUNCTION + typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type + operator[] ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 ); + } + + template< typename iType0 > + KOKKOS_INLINE_FUNCTION + typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type + operator() ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 ); + } + + template< typename iType0 > + KOKKOS_INLINE_FUNCTION + typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type + at( const iType0 & i0 , const int , const int , const int , + const int , const int , const int , const int ) const + { + 
KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 ); + } + + //------------------------------------ + // Access to the underlying contiguous storage of this view specialization. + // These methods are specific to specialization of a view. + + KOKKOS_INLINE_FUNCTION + typename traits::value_type::value_type * ptr_on_device() const { return m_ptr_on_device ; } + + // Stride of physical storage, dimensioned to at least Rank + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { m_offset_map.stride( s ); } + + // Count of contiguously allocated data members including padding. + KOKKOS_INLINE_FUNCTION + typename traits::size_type capacity() const + { return m_offset_map.capacity(); } +}; + +} // namespace Kokkos + +#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */ + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template< class DeviceType > +int TestViewAggregate() +{ + +#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + + typedef Kokkos::View< Test::Array<double,32> * , DeviceType > a32_type ; + typedef typename a32_type::array_type a32_base_type ; + + typedef Kokkos::View< Test::Array<double> * , DeviceType > a0_type ; + typedef typename a0_type::array_type a0_base_type ; + + a32_type a32("a32",100); + a32_base_type a32_base ; + + a0_type a0("a0",100,32); + a0_base_type a0_base ; + + a32_base = a32 ; + a0_base = a0 ; + +#endif /* #if ! 
defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */ + + return 0 ; +} + +} + + +#endif /* #ifndef TEST_AGGREGATE_HPP */ diff --git a/lib/kokkos/core/unit_test/TestAggregateReduction.hpp b/lib/kokkos/core/unit_test/TestAggregateReduction.hpp new file mode 100755 index 0000000000000000000000000000000000000000..7175d34348f4f7f7b1db353fd470635aa77a4341 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestAggregateReduction.hpp @@ -0,0 +1,189 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_AGGREGATE_REDUCTION_HPP +#define TEST_AGGREGATE_REDUCTION_HPP + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +namespace Test { + +template< typename T , unsigned N > +struct StaticArray { + T value[N] ; + + KOKKOS_INLINE_FUNCTION + StaticArray() + { for ( unsigned i = 0 ; i < N ; ++i ) value[i] = T(); } + + KOKKOS_INLINE_FUNCTION + StaticArray( const StaticArray & rhs ) + { for ( unsigned i = 0 ; i < N ; ++i ) value[i] = rhs.value[i]; } + + KOKKOS_INLINE_FUNCTION + operator T () { return value[0]; } + + KOKKOS_INLINE_FUNCTION + StaticArray & operator = ( const T & rhs ) + { + for ( unsigned i = 0 ; i < N ; ++i ) value[i] = rhs ; + return *this ; + } + + KOKKOS_INLINE_FUNCTION + StaticArray & operator = ( const StaticArray & rhs ) + { + for ( unsigned i = 0 ; i < N ; ++i ) value[i] = rhs.value[i] ; + return *this ; + } + + KOKKOS_INLINE_FUNCTION + StaticArray operator * ( const StaticArray & rhs ) + { + StaticArray tmp ; + for ( unsigned i = 0 ; i < N ; ++i ) tmp.value[i] = value[i] * rhs.value[i] ; + return tmp ; + } + + KOKKOS_INLINE_FUNCTION + StaticArray operator + ( const StaticArray & rhs ) + { + StaticArray tmp ; + for ( unsigned i = 0 ; i < N ; ++i ) tmp.value[i] = value[i] + rhs.value[i] ; + return 
tmp ; + } + + KOKKOS_INLINE_FUNCTION + StaticArray & operator += ( const StaticArray & rhs ) + { + for ( unsigned i = 0 ; i < N ; ++i ) value[i] += rhs.value[i] ; + return *this ; + } + + KOKKOS_INLINE_FUNCTION + void operator += ( const volatile StaticArray & rhs ) volatile + { + for ( unsigned i = 0 ; i < N ; ++i ) value[i] += rhs.value[i] ; + } +}; + +template< typename T , class Space > +struct DOT { + typedef T value_type ; + typedef Space execution_space ; + + Kokkos::View< value_type * , Space > a ; + Kokkos::View< value_type * , Space > b ; + + DOT( const Kokkos::View< value_type * , Space > arg_a + , const Kokkos::View< value_type * , Space > arg_b + ) + : a( arg_a ), b( arg_b ) {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int i , value_type & update ) const + { + update += a(i) * b(i); + } +}; + +template< typename T , class Space > +struct FILL { + typedef T value_type ; + typedef Space execution_space ; + + Kokkos::View< value_type * , Space > a ; + Kokkos::View< value_type * , Space > b ; + + FILL( const Kokkos::View< value_type * , Space > & arg_a + , const Kokkos::View< value_type * , Space > & arg_b + ) + : a( arg_a ), b( arg_b ) {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int i ) const + { + a(i) = i % 2 ? i + 1 : 1 ; + b(i) = i % 2 ? 1 : i + 1 ; + } +}; + +template< class Space > +void TestViewAggregateReduction() +{ + const int count = 2 ; + const long result = count % 2 ? 
( count * ( ( count + 1 ) / 2 ) ) + : ( ( count / 2 ) * ( count + 1 ) ); + + Kokkos::View< long * , Space > a("a",count); + Kokkos::View< long * , Space > b("b",count); + Kokkos::View< StaticArray<long,4> * , Space > a4("a4",count); + Kokkos::View< StaticArray<long,4> * , Space > b4("b4",count); + Kokkos::View< StaticArray<long,10> * , Space > a10("a10",count); + Kokkos::View< StaticArray<long,10> * , Space > b10("b10",count); + + Kokkos::parallel_for( count , FILL<long,Space>(a,b) ); + Kokkos::parallel_for( count , FILL< StaticArray<long,4> , Space >(a4,b4) ); + Kokkos::parallel_for( count , FILL< StaticArray<long,10> , Space >(a10,b10) ); + + long r = 0; + StaticArray<long,4> r4 ; + StaticArray<long,10> r10 ; + + Kokkos::parallel_reduce( count , DOT<long,Space>(a,b) , r ); + Kokkos::parallel_reduce( count , DOT< StaticArray<long,4> , Space >(a4,b4) , r4 ); + Kokkos::parallel_reduce( count , DOT< StaticArray<long,10> , Space >(a10,b10) , r10 ); + + ASSERT_EQ( result , r ); + for ( int i = 0 ; i < 10 ; ++i ) { ASSERT_EQ( result , r10.value[i] ); } + for ( int i = 0 ; i < 4 ; ++i ) { ASSERT_EQ( result , r4.value[i] ); } +} + +} + +#endif /* #ifndef TEST_AGGREGATE_REDUCTION_HPP */ + diff --git a/lib/kokkos/core/unit_test/TestAllocationTracker.cpp b/lib/kokkos/core/unit_test/TestAllocationTracker.cpp new file mode 100755 index 0000000000000000000000000000000000000000..371b0ac7588c7239ebf8a7f146faea63bc37faa2 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestAllocationTracker.cpp @@ -0,0 +1,145 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <iostream> +#include <vector> + +#include <Kokkos_Core.hpp> + +#include <impl/Kokkos_AllocationTracker.hpp> +#include <impl/Kokkos_BasicAllocators.hpp> + +namespace Test { + +class alocation_tracker : public ::testing::Test { +protected: + static void SetUpTestCase() + { + Kokkos::initialize(); + } + + static void TearDownTestCase() + { + Kokkos::finalize(); + } +}; + +TEST_F( alocation_tracker, simple) +{ + using namespace Kokkos::Impl; + + { + AllocationTracker tracker; + EXPECT_FALSE( tracker.is_valid() ); + } + + // test ref count and label + { + int size = 100; + std::vector<AllocationTracker> trackers(size); + + trackers[0] = AllocationTracker( MallocAllocator(), 128,"Test"); + + for (int i=0; i<size; ++i) { + trackers[i] = trackers[0]; + } + + EXPECT_EQ(100u, trackers[0].ref_count()); + EXPECT_EQ(std::string("Test"), std::string(trackers[0].label())); + } + + + // test circular list + { + int num_allocs = 3000; + unsigned ref_count = 100; + + std::vector<AllocationTracker> trackers(num_allocs); + + for (int i=0; i<num_allocs; ++i) { + trackers[i] = AllocationTracker( MallocAllocator(), 128, "Test"); + std::vector<AllocationTracker> ref_trackers(ref_count); + for (unsigned j=0; j<ref_count; ++j) { + ref_trackers[j] = trackers[i]; + } + EXPECT_EQ( ref_count + 1u, trackers[i].ref_count() ); + } + + for (int i=0; i<num_allocs; ++i) { + EXPECT_EQ( 1u, trackers[i].ref_count() ); + } + } +} + +TEST_F( alocation_tracker, force_leaks) +{ +// uncomment to force memory leaks +#if 0 + using namespace Kokkos::Impl; + Kokkos::kokkos_malloc("Forced Leak", 4096*10); + Kokkos::kokkos_malloc<Kokkos::HostSpace>("Forced Leak", 4096*10); +#endif +} + +TEST_F( alocation_tracker, disable_reference_counting) +{ + using namespace Kokkos::Impl; + // test ref count and label + { + int size = 100; + 
std::vector<AllocationTracker> trackers(size); + + trackers[0] = AllocationTracker( MallocAllocator(), 128,"Test"); + + for (int i=1; i<size; ++i) { + trackers[i] = CopyWithoutTracking::apply(trackers[0]); + } + + EXPECT_EQ(1u, trackers[0].ref_count()); + EXPECT_EQ(std::string("Test"), std::string(trackers[0].label())); + } +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/TestAtomic.hpp b/lib/kokkos/core/unit_test/TestAtomic.hpp new file mode 100755 index 0000000000000000000000000000000000000000..d273c287e8cb41b7dd836b3c72266f42d740bcbf --- /dev/null +++ b/lib/kokkos/core/unit_test/TestAtomic.hpp @@ -0,0 +1,376 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +namespace TestAtomic { + +// Struct for testing arbitrary size atomics + +template<int N> +struct SuperScalar { + double val[N]; + + KOKKOS_INLINE_FUNCTION + SuperScalar() { + for(int i=0; i<N; i++) + val[i] = 0.0; + } + + KOKKOS_INLINE_FUNCTION + SuperScalar(const SuperScalar& src) { + for(int i=0; i<N; i++) + val[i] = src.val[i]; + } + + KOKKOS_INLINE_FUNCTION + SuperScalar(const volatile SuperScalar& src) { + for(int i=0; i<N; i++) + val[i] = src.val[i]; + } + + KOKKOS_INLINE_FUNCTION + SuperScalar& operator = (const SuperScalar& src) { + for(int i=0; i<N; i++) + val[i] = src.val[i]; + return *this; + } + + KOKKOS_INLINE_FUNCTION + SuperScalar& operator = (const volatile SuperScalar& src) { + for(int i=0; i<N; i++) + val[i] = src.val[i]; + return *this; + } + + KOKKOS_INLINE_FUNCTION + volatile SuperScalar& operator = (const SuperScalar& src) volatile { + for(int i=0; i<N; i++) + val[i] = src.val[i]; + return *this; + } + + KOKKOS_INLINE_FUNCTION + SuperScalar operator + (const SuperScalar& src) { + SuperScalar tmp = *this; + for(int i=0; i<N; i++) + tmp.val[i] += src.val[i]; + return tmp; + } + + KOKKOS_INLINE_FUNCTION + SuperScalar& operator += (const double& src) { + for(int i=0; i<N; i++) + val[i] += 1.0*(i+1)*src; + return *this; + 
} + + KOKKOS_INLINE_FUNCTION + SuperScalar& operator += (const SuperScalar& src) { + for(int i=0; i<N; i++) + val[i] += src.val[i]; + return *this; + } + + KOKKOS_INLINE_FUNCTION + bool operator == (const SuperScalar& src) { + bool compare = true; + for(int i=0; i<N; i++) + compare = compare && ( val[i] == src.val[i]); + return compare; + } + + KOKKOS_INLINE_FUNCTION + bool operator != (const SuperScalar& src) { + bool compare = true; + for(int i=0; i<N; i++) + compare = compare && ( val[i] == src.val[i]); + return !compare; + } + + + + KOKKOS_INLINE_FUNCTION + SuperScalar(const double& src) { + for(int i=0; i<N; i++) + val[i] = 1.0 * (i+1) * src; + } + +}; + +template<int N> +std::ostream& operator<<(std::ostream& os, const SuperScalar<N>& dt) +{ + os << "{ "; + for(int i=0;i<N-1;i++) + os << dt.val[i] << ", "; + os << dt.val[N-1] << "}"; + return os; +} + +template<class T,class DEVICE_TYPE> +struct ZeroFunctor { + typedef DEVICE_TYPE execution_space; + typedef typename Kokkos::View<T,execution_space> type; + typedef typename Kokkos::View<T,execution_space>::HostMirror h_type; + type data; + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + data() = 0; + } +}; + +//--------------------------------------------------- +//--------------atomic_fetch_add--------------------- +//--------------------------------------------------- + +template<class T,class DEVICE_TYPE> +struct AddFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + Kokkos::atomic_fetch_add(&data(),(T)1); + } +}; + +template<class T, class execution_space > +T AddLoop(int loop) { + struct ZeroFunctor<T,execution_space> f_zero; + typename ZeroFunctor<T,execution_space>::type data("Data"); + typename ZeroFunctor<T,execution_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1,f_zero); + execution_space::fence(); + + struct AddFunctor<T,execution_space> f_add; + 
f_add.data = data; + Kokkos::parallel_for(loop,f_add); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + return val; +} + +template<class T> +T AddLoopSerial(int loop) { + T* data = new T[1]; + data[0] = 0; + + for(int i=0;i<loop;i++) + *data+=(T)1; + + T val = *data; + delete data; + return val; +} + +template<class T,class DEVICE_TYPE> +struct CASFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + T old = data(); + T newval, assumed; + do { + assumed = old; + newval = assumed + (T)1; + old = Kokkos::atomic_compare_exchange(&data(), assumed, newval); + } + while( old != assumed ); + } +}; + +template<class T, class execution_space > +T CASLoop(int loop) { + struct ZeroFunctor<T,execution_space> f_zero; + typename ZeroFunctor<T,execution_space>::type data("Data"); + typename ZeroFunctor<T,execution_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1,f_zero); + execution_space::fence(); + + struct CASFunctor<T,execution_space> f_cas; + f_cas.data = data; + Kokkos::parallel_for(loop,f_cas); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + T val = h_data(); + + return val; +} + +template<class T> +T CASLoopSerial(int loop) { + T* data = new T[1]; + data[0] = 0; + + for(int i=0;i<loop;i++) { + T assumed; + T newval; + T old; + do { + assumed = *data; + newval = assumed + (T)1; + old = *data; + *data = newval; + } + while(!(assumed==old)); + } + + T val = *data; + delete data; + return val; +} + +template<class T,class DEVICE_TYPE> +struct ExchFunctor{ + typedef DEVICE_TYPE execution_space; + typedef Kokkos::View<T,execution_space> type; + type data, data2; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + T old = Kokkos::atomic_exchange(&data(),(T)i); + Kokkos::atomic_fetch_add(&data2(),old); + } +}; + +template<class T, class execution_space > +T ExchLoop(int 
loop) { + struct ZeroFunctor<T,execution_space> f_zero; + typename ZeroFunctor<T,execution_space>::type data("Data"); + typename ZeroFunctor<T,execution_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1,f_zero); + execution_space::fence(); + + typename ZeroFunctor<T,execution_space>::type data2("Data"); + typename ZeroFunctor<T,execution_space>::h_type h_data2("HData"); + f_zero.data = data2; + Kokkos::parallel_for(1,f_zero); + execution_space::fence(); + + struct ExchFunctor<T,execution_space> f_exch; + f_exch.data = data; + f_exch.data2 = data2; + Kokkos::parallel_for(loop,f_exch); + execution_space::fence(); + + Kokkos::deep_copy(h_data,data); + Kokkos::deep_copy(h_data2,data2); + T val = h_data() + h_data2(); + + return val; +} + +template<class T> +T ExchLoopSerial(int loop) { + T* data = new T[1]; + T* data2 = new T[1]; + data[0] = 0; + data2[0] = 0; + for(int i=0;i<loop;i++) { + T old = *data; + *data=(T) i; + *data2+=old; + } + + T val = *data2 + *data; + delete data; + delete data2; + return val; +} + +template<class T, class DeviceType > +T LoopVariant(int loop, int test) { + switch (test) { + case 1: return AddLoop<T,DeviceType>(loop); + case 2: return CASLoop<T,DeviceType>(loop); + case 3: return ExchLoop<T,DeviceType>(loop); + } + return 0; +} + +template<class T> +T LoopVariantSerial(int loop, int test) { + switch (test) { + case 1: return AddLoopSerial<T>(loop); + case 2: return CASLoopSerial<T>(loop); + case 3: return ExchLoopSerial<T>(loop); + } + return 0; +} + +template<class T,class DeviceType> +bool Loop(int loop, int test) +{ + T res = LoopVariant<T,DeviceType>(loop,test); + T resSerial = LoopVariantSerial<T>(loop,test); + + bool passed = true; + + if ( resSerial != res ) { + passed = false; + + std::cout << "Loop<" + << typeid(T).name() + << ">( test = " + << test << " FAILED : " + << resSerial << " != " << res + << std::endl ; + } + + + return passed ; +} + +} + diff --git 
a/lib/kokkos/core/unit_test/TestCXX11.hpp b/lib/kokkos/core/unit_test/TestCXX11.hpp new file mode 100755 index 0000000000000000000000000000000000000000..f48c76de508c1c828466955012dfaa76fb925866 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestCXX11.hpp @@ -0,0 +1,319 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#include <Kokkos_Core.hpp> + +namespace TestCXX11 { + +template<class DeviceType> +struct FunctorAddTest{ + typedef Kokkos::View<double**,DeviceType> view_type; + view_type a_, b_; + typedef DeviceType execution_space; + FunctorAddTest(view_type & a, view_type &b):a_(a),b_(b) {} + void operator() (const int& i) const { + b_(i,0) = a_(i,1) + a_(i,2); + b_(i,1) = a_(i,0) - a_(i,3); + b_(i,2) = a_(i,4) + a_(i,0); + b_(i,3) = a_(i,2) - a_(i,1); + b_(i,4) = a_(i,3) + a_(i,4); + } + + typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ; + void operator() (const team_member & dev) const { + int i = dev.league_rank()*dev.team_size() + dev.team_rank(); + b_(i,0) = a_(i,1) + a_(i,2); + b_(i,1) = a_(i,0) - a_(i,3); + b_(i,2) = a_(i,4) + a_(i,0); + b_(i,3) = a_(i,2) - a_(i,1); + b_(i,4) = a_(i,3) + a_(i,4); + } +}; + +template<class DeviceType, bool PWRTest> +double AddTestFunctor() { + + typedef Kokkos::TeamPolicy<DeviceType> policy_type ; + + Kokkos::View<double**,DeviceType> a("A",100,5); + Kokkos::View<double**,DeviceType> b("B",100,5); + typename Kokkos::View<double**,DeviceType>::HostMirror h_a = Kokkos::create_mirror_view(a); + typename Kokkos::View<double**,DeviceType>::HostMirror h_b = Kokkos::create_mirror_view(b); + + for(int i=0;i<100;i++) { 
+ for(int j=0;j<5;j++) + h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j; + } + Kokkos::deep_copy(a,h_a); + + if(PWRTest==false) + Kokkos::parallel_for(100,FunctorAddTest<DeviceType>(a,b)); + else + Kokkos::parallel_for(policy_type(25,4),FunctorAddTest<DeviceType>(a,b)); + Kokkos::deep_copy(h_b,b); + + double result = 0; + for(int i=0;i<100;i++) { + for(int j=0;j<5;j++) + result += h_b(i,j); + } + + return result; +} + + + +#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) +template<class DeviceType, bool PWRTest> +double AddTestLambda() { + + typedef Kokkos::TeamPolicy<DeviceType> policy_type ; + + Kokkos::View<double**,DeviceType> a("A",100,5); + Kokkos::View<double**,DeviceType> b("B",100,5); + typename Kokkos::View<double**,DeviceType>::HostMirror h_a = Kokkos::create_mirror_view(a); + typename Kokkos::View<double**,DeviceType>::HostMirror h_b = Kokkos::create_mirror_view(b); + + for(int i=0;i<100;i++) { + for(int j=0;j<5;j++) + h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j; + } + Kokkos::deep_copy(a,h_a); + + if(PWRTest==false) { + Kokkos::parallel_for(100,[=](const int& i) { + b(i,0) = a(i,1) + a(i,2); + b(i,1) = a(i,0) - a(i,3); + b(i,2) = a(i,4) + a(i,0); + b(i,3) = a(i,2) - a(i,1); + b(i,4) = a(i,3) + a(i,4); + }); + } else { + typedef typename policy_type::member_type team_member ; + Kokkos::parallel_for(policy_type(25,4),[=](const team_member & dev) { + int i = dev.league_rank()*dev.team_size() + dev.team_rank(); + b(i,0) = a(i,1) + a(i,2); + b(i,1) = a(i,0) - a(i,3); + b(i,2) = a(i,4) + a(i,0); + b(i,3) = a(i,2) - a(i,1); + b(i,4) = a(i,3) + a(i,4); + }); + } + Kokkos::deep_copy(h_b,b); + + double result = 0; + for(int i=0;i<100;i++) { + for(int j=0;j<5;j++) + result += h_b(i,j); + } + + return result; +} + +#else +template<class DeviceType, bool PWRTest> +double AddTestLambda() { + return AddTestFunctor<DeviceType,PWRTest>(); +} +#endif + + +template<class DeviceType> +struct FunctorReduceTest{ + typedef Kokkos::View<double**,DeviceType> view_type; + view_type a_; + typedef 
DeviceType execution_space; + typedef double value_type; + FunctorReduceTest(view_type & a):a_(a) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i, value_type& sum) const { + sum += a_(i,1) + a_(i,2); + sum += a_(i,0) - a_(i,3); + sum += a_(i,4) + a_(i,0); + sum += a_(i,2) - a_(i,1); + sum += a_(i,3) + a_(i,4); + } + + typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ; + + KOKKOS_INLINE_FUNCTION + void operator() (const team_member & dev, value_type& sum) const { + int i = dev.league_rank()*dev.team_size() + dev.team_rank(); + sum += a_(i,1) + a_(i,2); + sum += a_(i,0) - a_(i,3); + sum += a_(i,4) + a_(i,0); + sum += a_(i,2) - a_(i,1); + sum += a_(i,3) + a_(i,4); + } + KOKKOS_INLINE_FUNCTION + void init(value_type& update) const {update = 0.0;} + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& update, volatile value_type const& input) const {update += input;} +}; + +template<class DeviceType, bool PWRTest> +double ReduceTestFunctor() { + + typedef Kokkos::TeamPolicy<DeviceType> policy_type ; + typedef Kokkos::View<double**,DeviceType> view_type ; + typedef Kokkos::View<double,typename view_type::host_mirror_space,Kokkos::MemoryUnmanaged> unmanaged_result ; + + view_type a("A",100,5); + typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a); + + for(int i=0;i<100;i++) { + for(int j=0;j<5;j++) + h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j; + } + Kokkos::deep_copy(a,h_a); + + double result = 0.0; + if(PWRTest==false) + Kokkos::parallel_reduce(100,FunctorReduceTest<DeviceType>(a), unmanaged_result( & result )); + else + Kokkos::parallel_reduce(policy_type(25,4),FunctorReduceTest<DeviceType>(a), unmanaged_result( & result )); + + return result; +} + +#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) +template<class DeviceType, bool PWRTest> +double ReduceTestLambda() { + + typedef Kokkos::TeamPolicy<DeviceType> policy_type ; + typedef Kokkos::View<double**,DeviceType> view_type ; + typedef 
Kokkos::View<double,typename view_type::host_mirror_space,Kokkos::MemoryUnmanaged> unmanaged_result ; + + view_type a("A",100,5); + typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a); + + for(int i=0;i<100;i++) { + for(int j=0;j<5;j++) + h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j; + } + Kokkos::deep_copy(a,h_a); + + double result = 0.0; + + if(PWRTest==false) { + Kokkos::parallel_reduce(100,[=](const int& i, double& sum) { + sum += a(i,1) + a(i,2); + sum += a(i,0) - a(i,3); + sum += a(i,4) + a(i,0); + sum += a(i,2) - a(i,1); + sum += a(i,3) + a(i,4); + }, unmanaged_result( & result ) ); + } else { + typedef typename policy_type::member_type team_member ; + Kokkos::parallel_reduce(policy_type(25,4),[=](const team_member & dev, double& sum) { + int i = dev.league_rank()*dev.team_size() + dev.team_rank(); + sum += a(i,1) + a(i,2); + sum += a(i,0) - a(i,3); + sum += a(i,4) + a(i,0); + sum += a(i,2) - a(i,1); + sum += a(i,3) + a(i,4); + }, unmanaged_result( & result ) ); + } + + return result; +} + +#else +template<class DeviceType, bool PWRTest> +double ReduceTestLambda() { + return ReduceTestFunctor<DeviceType,PWRTest>(); +} +#endif + +template<class DeviceType> +double TestVariantLambda(int test) { + switch (test) { + case 1: return AddTestLambda<DeviceType,false>(); + case 2: return AddTestLambda<DeviceType,true>(); + case 3: return ReduceTestLambda<DeviceType,false>(); + case 4: return ReduceTestLambda<DeviceType,true>(); + } + return 0; +} + + +template<class DeviceType> +double TestVariantFunctor(int test) { + switch (test) { + case 1: return AddTestFunctor<DeviceType,false>(); + case 2: return AddTestFunctor<DeviceType,true>(); + case 3: return ReduceTestFunctor<DeviceType,false>(); + case 4: return ReduceTestFunctor<DeviceType,true>(); + } + return 0; +} + +template<class DeviceType> +bool Test(int test) { + +#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA + double res_functor = TestVariantFunctor<DeviceType>(test); + double res_lambda = 
TestVariantLambda<DeviceType>(test); + + char testnames[5][256] = {" " + ,"AddTest","AddTest TeamPolicy" + ,"ReduceTest","ReduceTest TeamPolicy" + }; + bool passed = true; + + if ( res_functor != res_lambda ) { + passed = false; + + std::cout << "CXX11 ( test = '" + << testnames[test] << "' FAILED : " + << res_functor << " != " << res_lambda + << std::endl ; + } + + return passed ; +#else + return true; +#endif +} + +} diff --git a/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp b/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp new file mode 100755 index 0000000000000000000000000000000000000000..9d20079b2fb13730feac99002a9c2590b6b800ff --- /dev/null +++ b/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp @@ -0,0 +1,103 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#include <Kokkos_Core.hpp> + +#ifndef TESTCXX11DEDUCTION_HPP +#define TESTCXX11DEDUCTION_HPP + +namespace TestCXX11 { + +#if defined( KOKKOS_HAVE_CXX11 ) + +struct TestReductionDeductionTagA {}; +struct TestReductionDeductionTagB {}; + +template < class ExecSpace > +struct TestReductionDeductionFunctor { + + // KOKKOS_INLINE_FUNCTION + // void operator()( long i , long & value ) const + // { value += i + 1 ; } + + KOKKOS_INLINE_FUNCTION + void operator()( TestReductionDeductionTagA , long i , long & value ) const + { value += ( 2 * i + 1 ) + ( 2 * i + 2 ); } + + KOKKOS_INLINE_FUNCTION + void operator()( const TestReductionDeductionTagB & , const long i , long & value ) const + { value += ( 3 * i + 1 ) + ( 3 * i + 2 ) + ( 3 * i + 3 ) ; } + +}; + +template< class ExecSpace > +void test_reduction_deduction() +{ + typedef TestReductionDeductionFunctor< ExecSpace > Functor ; + + const long N = 50 ; + // const long answer = N % 2 ? ( N * ((N+1)/2 )) : ( (N/2) * (N+1) ); + const long answerA = N % 2 ? ( (2*N) * (((2*N)+1)/2 )) : ( ((2*N)/2) * ((2*N)+1) ); + const long answerB = N % 2 ? 
( (3*N) * (((3*N)+1)/2 )) : ( ((3*N)/2) * ((3*N)+1) ); + long result = 0 ; + + // Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace>(0,N) , Functor() , result ); + // ASSERT_EQ( answer , result ); + + Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,TestReductionDeductionTagA>(0,N) , Functor() , result ); + ASSERT_EQ( answerA , result ); + + Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,TestReductionDeductionTagB>(0,N) , Functor() , result ); + ASSERT_EQ( answerB , result ); +} + +#else /* ! defined( KOKKOS_HAVE_CXX11 ) */ + +template< class ExecSpace > +void test_reduction_deduction() {} + +#endif /* ! defined( KOKKOS_HAVE_CXX11 ) */ + +} + +#endif + diff --git a/lib/kokkos/core/unit_test/TestCompilerMacros.hpp b/lib/kokkos/core/unit_test/TestCompilerMacros.hpp new file mode 100755 index 0000000000000000000000000000000000000000..dfa2250c04ae8cc785383b1f64a127ad40279f57 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestCompilerMacros.hpp @@ -0,0 +1,93 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#define KOKKOS_PRAGMA_UNROLL(a) + +namespace TestCompilerMacros { + +template<class DEVICE_TYPE> +struct AddFunctor { + typedef DEVICE_TYPE execution_space; + typedef typename Kokkos::View<int**,execution_space> type; + type a,b; + int length; + + AddFunctor(type a_, type b_):a(a_),b(b_),length(a.dimension_1()) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { +#ifdef KOKKOS_HAVE_PRAGMA_UNROLL + #pragma unroll +#endif +#ifdef KOKKOS_HAVE_PRAGMA_IVDEP + #pragma ivdep +#endif +#ifdef KOKKOS_HAVE_PRAGMA_VECTOR + #pragma vector always +#endif +#ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT + #pragma loop count(128) +#endif +#ifdef KOKKOS_HAVE_PRAGMA_SIMD + #pragma simd +#endif + for(int j=0;j<length;j++) + a(i,j) += b(i,j); + } +}; + +template<class DeviceType> +bool Test() { + typedef typename Kokkos::View<int**,DeviceType> type; + type a("A",1024,128); + type b("B",1024,128); + + AddFunctor<DeviceType> f(a,b); + Kokkos::parallel_for(1024,f); + DeviceType::fence(); + return true; +} + +} diff --git a/lib/kokkos/core/unit_test/TestCuda.cpp b/lib/kokkos/core/unit_test/TestCuda.cpp new file mode 100755 index 0000000000000000000000000000000000000000..4a74d1f1836f3cd3160e683ccbeae41fb45f563a --- /dev/null +++ b/lib/kokkos/core/unit_test/TestCuda.cpp @@ -0,0 +1,495 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <iostream> + +#include <Kokkos_Core.hpp> + +//---------------------------------------------------------------------------- + +#include <impl/Kokkos_ViewTileLeft.hpp> + +//---------------------------------------------------------------------------- + +#include <TestSharedAlloc.hpp> +#include <TestViewMapping.hpp> + +#include <TestViewImpl.hpp> +#include <TestAtomic.hpp> + +#include <TestViewAPI.hpp> +#include <TestViewSubview.hpp> +#include <TestTile.hpp> + +#include <TestReduce.hpp> +#include <TestScan.hpp> +#include <TestRange.hpp> +#include <TestTeam.hpp> +#include <TestAggregate.hpp> +#include <TestAggregateReduction.hpp> +#include <TestCompilerMacros.hpp> +#include <TestMemorySpaceTracking.hpp> +#include <TestTeamVector.hpp> +#include <TestTemplateMetaFunctions.hpp> +#include <TestCXX11Deduction.hpp> + +//---------------------------------------------------------------------------- + +class cuda : public ::testing::Test { +protected: + static void SetUpTestCase() + { + Kokkos::Cuda::print_configuration( std::cout ); + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Cuda::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +//---------------------------------------------------------------------------- + +namespace Test { + +__global__ +void test_abort() +{ + Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< + Kokkos::CudaSpace , + Kokkos::HostSpace >::verify(); +} + +__global__ +void test_cuda_spaces_int_value( int * ptr ) +{ + if ( *ptr == 42 ) { *ptr = 2 * 42 ; } +} + + +TEST_F( cuda , compiler_macros ) +{ + ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) ); +} + +TEST_F( cuda , memory_space ) +{ + TestMemorySpace< Kokkos::Cuda >(); +} + +TEST_F( 
cuda, spaces ) +{ + if ( Kokkos::CudaUVMSpace::available() ) { + + Kokkos::Impl::AllocationTracker tracker = Kokkos::CudaUVMSpace::allocate_and_track("uvm_ptr",sizeof(int)); + + int * uvm_ptr = (int*) tracker.alloc_ptr(); + + *uvm_ptr = 42 ; + + Kokkos::Cuda::fence(); + test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr); + Kokkos::Cuda::fence(); + + EXPECT_EQ( *uvm_ptr, int(2*42) ); + + } +} + +//---------------------------------------------------------------------------- + +TEST_F( cuda , impl_shared_alloc ) +{ + test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >(); + test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >(); + test_shared_alloc< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >(); +} + +TEST_F( cuda , impl_view_mapping ) +{ + test_view_mapping< Kokkos::Cuda >(); + test_view_mapping_subview< Kokkos::Cuda >(); + test_view_mapping_operator< Kokkos::Cuda >(); + TestViewMappingAtomic< Kokkos::Cuda >::run(); +} + +template< class MemSpace > +struct TestViewCudaTexture { + + enum { N = 1000 }; + + using V = Kokkos::Experimental::View<double*,MemSpace> ; + using T = Kokkos::Experimental::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ; + + V m_base ; + T m_tex ; + + struct TagInit {}; + struct TagTest {}; + + KOKKOS_INLINE_FUNCTION + void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; } + + KOKKOS_INLINE_FUNCTION + void operator()( const TagTest & , const int i , long & error_count ) const + { if ( m_tex[i] != i + 1 ) ++error_count ; } + + TestViewCudaTexture() + : m_base("base",N) + , m_tex( m_base ) + {} + + static void run() + { + EXPECT_TRUE( ( std::is_same< typename V::reference_type + , double & + >::value ) ); + + EXPECT_TRUE( ( std::is_same< typename T::reference_type + , const double + >::value ) ); + + EXPECT_TRUE( V::reference_type_is_lvalue_reference ); // An ordinary view + EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch 
returns by value + + TestViewCudaTexture self ; + Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self ); + long error_count = -1 ; + Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , error_count ); + EXPECT_EQ( error_count , 0 ); + } +}; + + +TEST_F( cuda , impl_view_texture ) +{ + TestViewCudaTexture< Kokkos::CudaSpace >::run(); + TestViewCudaTexture< Kokkos::CudaUVMSpace >::run(); +} + +template< class MemSpace , class ExecSpace > +struct TestViewCudaAccessible { + + enum { N = 1000 }; + + using V = Kokkos::Experimental::View<double*,MemSpace> ; + + V m_base ; + + struct TagInit {}; + struct TagTest {}; + + KOKKOS_INLINE_FUNCTION + void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; } + + KOKKOS_INLINE_FUNCTION + void operator()( const TagTest & , const int i , long & error_count ) const + { if ( m_base[i] != i + 1 ) ++error_count ; } + + TestViewCudaAccessible() + : m_base("base",N) + {} + + static void run() + { + TestViewCudaAccessible self ; + Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self ); + MemSpace::execution_space::fence(); + // Next access is a different execution space, must complete prior kernel. 
+ long error_count = -1 ; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count ); + EXPECT_EQ( error_count , 0 ); + } +}; + + +TEST_F( cuda , impl_view_accessible ) +{ + TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run(); + + TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run(); + TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run(); + + TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run(); + TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run(); +} + +//---------------------------------------------------------------------------- + +TEST_F( cuda, view_impl ) +{ + // test_abort<<<32,32>>>(); // Aborts the kernel with CUDA version 4.1 or greater + + test_view_impl< Kokkos::Cuda >(); +} + +TEST_F( cuda, view_api ) +{ + typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ; + typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | Kokkos::Unmanaged > > view_texture_unmanaged ; + + TestViewAPI< double , Kokkos::Cuda >(); + +#if 0 + Kokkos::View<double, Kokkos::Cuda > x("x"); + Kokkos::View<double[1], Kokkos::Cuda > y("y"); + // *x = 10 ; + // x() = 10 ; + // y[0] = 10 ; + // y(0) = 10 ; +#endif +} + +TEST_F( cuda, view_subview_auto_1d_left ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >(); +} + +TEST_F( cuda, view_subview_auto_1d_right ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >(); +} + +TEST_F( cuda, view_subview_auto_1d_stride ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >(); +} + +TEST_F( cuda, view_subview_assign_strided ) { + TestViewSubview::test_1d_strided_assignment< Kokkos::Cuda >(); +} + +TEST_F( cuda, view_subview_left_0 ) { + TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >(); +} + +TEST_F( cuda, 
view_subview_left_1 ) { + TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >(); +} + +TEST_F( cuda, view_subview_left_2 ) { + TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >(); +} + +TEST_F( cuda, view_subview_left_3 ) { + TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >(); +} + +TEST_F( cuda, view_subview_right_0 ) { + TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >(); +} + +TEST_F( cuda, view_subview_right_1 ) { + TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >(); +} + +TEST_F( cuda, view_subview_right_3 ) { + TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >(); +} + + + + +TEST_F( cuda, range_tag ) +{ + TestRange< Kokkos::Cuda >::test_for(1000); + TestRange< Kokkos::Cuda >::test_reduce(1000); + TestRange< Kokkos::Cuda >::test_scan(1000); +} + +TEST_F( cuda, team_tag ) +{ + TestTeamPolicy< Kokkos::Cuda >::test_for(1000); + TestTeamPolicy< Kokkos::Cuda >::test_reduce(1000); +} + +TEST_F( cuda, reduce ) +{ + TestReduce< long , Kokkos::Cuda >( 10000000 ); + TestReduce< double , Kokkos::Cuda >( 1000000 ); +} + +TEST_F( cuda, reduce_team ) +{ + TestReduceTeam< long , Kokkos::Cuda >( 10000000 ); + TestReduceTeam< double , Kokkos::Cuda >( 1000000 ); +} + +TEST_F( cuda, shared_team ) +{ + TestSharedTeam< Kokkos::Cuda >(); +} + +TEST_F( cuda, reduce_dynamic ) +{ + TestReduceDynamic< long , Kokkos::Cuda >( 10000000 ); + TestReduceDynamic< double , Kokkos::Cuda >( 1000000 ); +} + +TEST_F( cuda, reduce_dynamic_view ) +{ + TestReduceDynamicView< long , Kokkos::Cuda >( 10000000 ); + TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 ); +} + +TEST_F( cuda, atomic ) +{ + const int loop_count = 1e3 ; + + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned 
int,Kokkos::Cuda>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) ); +} + +//---------------------------------------------------------------------------- + +TEST_F( cuda, tile_layout) +{ + TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 ); + TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 ); + TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 ); + + TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 ); + TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 ); + TestTile::test< Kokkos::Cuda , 2 , 2 >( 4 , 4 ); + TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 ); + + TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 ); + TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 ); + + TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 ); + TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 ); + TestTile::test< Kokkos::Cuda , 4 , 4 
>( 9 , 9 ); + TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 ); + + TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 ); + TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 ); + TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 ); + TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 ); +} + + +TEST_F( cuda , view_aggregate ) +{ + TestViewAggregate< Kokkos::Cuda >(); + TestViewAggregateReduction< Kokkos::Cuda >(); +} + + +TEST_F( cuda , scan ) +{ + TestScan< Kokkos::Cuda >::test_range( 1 , 1000 ); + TestScan< Kokkos::Cuda >( 1000000 ); + TestScan< Kokkos::Cuda >( 10000000 ); + Kokkos::Cuda::fence(); +} + +TEST_F( cuda , team_scan ) +{ + TestScanTeam< Kokkos::Cuda >( 10 ); + TestScanTeam< Kokkos::Cuda >( 10000 ); +} + +} + +//---------------------------------------------------------------------------- + +TEST_F( cuda , template_meta_functions ) +{ + TestTemplateMetaFunctions<int, Kokkos::Cuda >(); +} + +//---------------------------------------------------------------------------- + +#ifdef KOKKOS_HAVE_CXX11 + +namespace Test { + +TEST_F( cuda , reduction_deduction ) +{ + TestCXX11::test_reduction_deduction< Kokkos::Cuda >(); +} + +TEST_F( cuda , team_vector ) +{ + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) ); +} + +} +#endif + diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp new file mode 
100755 index 0000000000000000000000000000000000000000..d1a525f9e5952034295efba204d74e39b0461129 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp @@ -0,0 +1,250 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_HAVE_CUDA) || defined(__CUDACC__) +//---------------------------------------------------------------------------- + +#include <TestViewImpl.hpp> +#include <TestAtomic.hpp> + +#include <TestViewAPI.hpp> + +#include <TestReduce.hpp> +#include <TestScan.hpp> +#include <TestTeam.hpp> +#include <TestAggregate.hpp> +#include <TestCompilerMacros.hpp> +#include <TestCXX11.hpp> +#include <TestTeamVector.hpp> + +namespace Test { + +class defaultdevicetype : public ::testing::Test { +protected: + static void SetUpTestCase() + { + Kokkos::initialize(); + } + + static void TearDownTestCase() + { + Kokkos::finalize(); + } +}; + + +TEST_F( defaultdevicetype, view_impl) { + test_view_impl< Kokkos::DefaultExecutionSpace >(); +} + +TEST_F( defaultdevicetype, view_api) { + TestViewAPI< double , Kokkos::DefaultExecutionSpace >(); +} + +TEST_F( defaultdevicetype, long_reduce) { + TestReduce< long , Kokkos::DefaultExecutionSpace >( 100000 ); +} + +TEST_F( defaultdevicetype, double_reduce) { + TestReduce< double , Kokkos::DefaultExecutionSpace >( 100000 ); +} + +TEST_F( defaultdevicetype, long_reduce_dynamic ) { + TestReduceDynamic< long , Kokkos::DefaultExecutionSpace >( 100000 ); +} + +TEST_F( defaultdevicetype, double_reduce_dynamic ) { + TestReduceDynamic< double , Kokkos::DefaultExecutionSpace >( 100000 ); +} + +TEST_F( defaultdevicetype, long_reduce_dynamic_view ) { + TestReduceDynamicView< long , Kokkos::DefaultExecutionSpace >( 100000 ); +} + + +TEST_F( defaultdevicetype , atomics ) +{ + const int loop_count = 1e4 ; + + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) ); + ASSERT_TRUE( ( 
TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,3) ) ); +} + +/*TEST_F( defaultdevicetype , view_remap ) +{ + enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 }; + + typedef Kokkos::View< double*[N1][N2][N3] , + Kokkos::LayoutRight , + Kokkos::DefaultExecutionSpace > output_type ; + + 
typedef Kokkos::View< int**[N2][N3] , + Kokkos::LayoutLeft , + Kokkos::DefaultExecutionSpace > input_type ; + + typedef Kokkos::View< int*[N0][N2][N3] , + Kokkos::LayoutLeft , + Kokkos::DefaultExecutionSpace > diff_type ; + + output_type output( "output" , N0 ); + input_type input ( "input" , N0 , N1 ); + diff_type diff ( "diff" , N0 ); + + int value = 0 ; + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) { + input(i0,i1,i2,i3) = ++value ; + }}}} + + // Kokkos::deep_copy( diff , input ); // throw with incompatible shape + Kokkos::deep_copy( output , input ); + + value = 0 ; + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) { + ++value ; + ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) ); + }}}} +}*/ + +//---------------------------------------------------------------------------- + + +TEST_F( defaultdevicetype , view_aggregate ) +{ + TestViewAggregate< Kokkos::DefaultExecutionSpace >(); +} + +//---------------------------------------------------------------------------- + +TEST_F( defaultdevicetype , scan ) +{ + TestScan< Kokkos::DefaultExecutionSpace >::test_range( 1 , 1000 ); + TestScan< Kokkos::DefaultExecutionSpace >( 1000000 ); + TestScan< Kokkos::DefaultExecutionSpace >( 10000000 ); + Kokkos::DefaultExecutionSpace::fence(); +} + + +TEST_F( defaultdevicetype , team_scan ) +{ + TestScanTeam< Kokkos::DefaultExecutionSpace >( 10 ); + TestScanTeam< Kokkos::DefaultExecutionSpace >( 10000 ); +} + +//---------------------------------------------------------------------------- + +TEST_F( defaultdevicetype , compiler_macros ) +{ + ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::DefaultExecutionSpace >() ) ); +} + + +//---------------------------------------------------------------------------- +#if defined (KOKKOS_HAVE_CXX11) 
+TEST_F( defaultdevicetype , cxx11 ) +{ + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(1) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(2) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(3) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(4) ) ); +} +#endif + +#if defined (KOKKOS_HAVE_CXX11) +TEST_F( defaultdevicetype , team_vector ) +{ + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(0) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(1) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(2) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(3) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(4) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(5) ) ); +} +#endif + +#if defined (KOKKOS_HAVE_CXX11) +TEST_F( defaultdevicetype , malloc ) +{ + int* data = (int*) Kokkos::kokkos_malloc(100*sizeof(int)); + ASSERT_NO_THROW(data = (int*) Kokkos::kokkos_realloc(data,120*sizeof(int))); + Kokkos::kokkos_free(data); +} +#endif + +} // namespace test + +#endif diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp new file mode 100755 index 0000000000000000000000000000000000000000..a1e3f8fb0adece50ce4f8f5e8b2204b66bb0fdc6 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp @@ -0,0 +1,390 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#ifdef KOKKOS_HAVE_OPENMP +#include <omp.h> +#endif + +#if !defined(KOKKOS_HAVE_CUDA) || defined(__CUDACC__) +//---------------------------------------------------------------------------- + +namespace Test { + +namespace Impl { + + char** init_kokkos_args(bool do_threads,bool do_numa,bool do_device,bool do_other, int& nargs, Kokkos::InitArguments& init_args) { + nargs = (do_threads?1:0) + + (do_numa?1:0) + + (do_device?1:0) + + (do_other?4:0); + char** args_kokkos = new char*[nargs]; + for(int i = 0; i < nargs; i++) + args_kokkos[i] = new char[20]; + + int threads_idx = do_other?1:0; + int numa_idx = (do_other?3:0) + (do_threads?1:0); + int device_idx = (do_other?3:0) + (do_threads?1:0) + (do_numa?1:0); + + + if(do_threads) { + int nthreads = 3; + +#ifdef KOKKOS_HAVE_OPENMP + if(omp_get_max_threads() < 3) + nthreads = omp_get_max_threads(); +#endif + + if(Kokkos::hwloc::available()) { + if(Kokkos::hwloc::get_available_threads_per_core()<3) + nthreads = Kokkos::hwloc::get_available_threads_per_core() + * Kokkos::hwloc::get_available_numa_count(); + } + +#ifdef KOKKOS_HAVE_SERIAL + if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value || + Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) { + nthreads = 1; + } +#endif + init_args.num_threads = nthreads; + sprintf(args_kokkos[threads_idx],"--threads=%i",nthreads); + } + + if(do_numa) { + int numa = 1; + if(Kokkos::hwloc::available()) + numa = Kokkos::hwloc::get_available_numa_count(); +#ifdef KOKKOS_HAVE_SERIAL + if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value || + Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) { + numa = 1; + } +#endif + + init_args.num_numa = numa; + 
sprintf(args_kokkos[numa_idx],"--numa=%i",numa); + } + + if(do_device) { + + init_args.device_id = 0; + sprintf(args_kokkos[device_idx],"--device=%i",0); + } + + if(do_other) { + sprintf(args_kokkos[0],"--dummyarg=1"); + sprintf(args_kokkos[threads_idx+(do_threads?1:0)],"--dummy2arg"); + sprintf(args_kokkos[threads_idx+(do_threads?1:0)+1],"dummy3arg"); + sprintf(args_kokkos[device_idx+(do_device?1:0)],"dummy4arg=1"); + } + + + return args_kokkos; + } + + Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, bool do_device) { + Kokkos::InitArguments args; + + if(do_threads) { + int nthreads = 3; + +#ifdef KOKKOS_HAVE_OPENMP + if(omp_get_max_threads() < 3) + nthreads = omp_get_max_threads(); +#endif + + if(Kokkos::hwloc::available()) { + if(Kokkos::hwloc::get_available_threads_per_core()<3) + nthreads = Kokkos::hwloc::get_available_threads_per_core() + * Kokkos::hwloc::get_available_numa_count(); + } +#ifdef KOKKOS_HAVE_SERIAL + if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value || + Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) { + nthreads = 1; + } +#endif + + args.num_threads = nthreads; + } + + if(do_numa) { + int numa = 1; + if(Kokkos::hwloc::available()) + numa = Kokkos::hwloc::get_available_numa_count(); +#ifdef KOKKOS_HAVE_SERIAL + if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value || + Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) { + numa = 1; + } +#endif + args.num_numa = numa; + } + + if(do_device) { + args.device_id = 0; + } + + return args; + } + + void check_correct_initialization(const Kokkos::InitArguments& argstruct) { + ASSERT_EQ( Kokkos::DefaultExecutionSpace::is_initialized(), 1); + ASSERT_EQ( Kokkos::HostSpace::execution_space::is_initialized(), 1); + + //Figure out the number of threads the HostSpace ExecutionSpace should have initialized to + int expected_nthreads = argstruct.num_threads; + 
if(expected_nthreads<1) { + if(Kokkos::hwloc::available()) { + expected_nthreads = Kokkos::hwloc::get_available_numa_count() + * Kokkos::hwloc::get_available_cores_per_numa() + * Kokkos::hwloc::get_available_threads_per_core(); + } else { + #ifdef KOKKOS_HAVE_OPENMP + if(Kokkos::Impl::is_same<Kokkos::HostSpace::execution_space,Kokkos::OpenMP>::value) { + expected_nthreads = omp_get_max_threads(); + } else + #endif + expected_nthreads = 1; + + } + #ifdef KOKKOS_HAVE_SERIAL + if(Kokkos::Impl::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Serial>::value || + Kokkos::Impl::is_same<Kokkos::DefaultHostExecutionSpace,Kokkos::Serial>::value ) + expected_nthreads = 1; + #endif + } + + int expected_numa = argstruct.num_numa; + if(expected_numa<1) { + if(Kokkos::hwloc::available()) { + expected_numa = Kokkos::hwloc::get_available_numa_count(); + } else { + expected_numa = 1; + } + #ifdef KOKKOS_HAVE_SERIAL + if(Kokkos::Impl::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Serial>::value || + Kokkos::Impl::is_same<Kokkos::DefaultHostExecutionSpace,Kokkos::Serial>::value ) + expected_numa = 1; + #endif + } + ASSERT_EQ(Kokkos::HostSpace::execution_space::thread_pool_size(),expected_nthreads); + +#ifdef KOKKOS_HAVE_CUDA + if(Kokkos::Impl::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Cuda>::value) { + int device; + cudaGetDevice( &device ); + int expected_device = argstruct.device_id; + if(argstruct.device_id<0) { + expected_device = 0; + } + ASSERT_EQ(expected_device,device); + } +#endif + } + + //ToDo: Add check whether correct number of threads are actually started + void test_no_arguments() { + Kokkos::initialize(); + check_correct_initialization(Kokkos::InitArguments()); + Kokkos::finalize(); + } + + void test_commandline_args(int nargs, char** args, const Kokkos::InitArguments& argstruct) { + Kokkos::initialize(nargs,args); + check_correct_initialization(argstruct); + Kokkos::finalize(); + } + + void test_initstruct_args(const Kokkos::InitArguments& args) { + 
Kokkos::initialize(args); + check_correct_initialization(args); + Kokkos::finalize(); + } +} + +class defaultdevicetypeinit : public ::testing::Test { +protected: + static void SetUpTestCase() + { + } + + static void TearDownTestCase() + { + } +}; + + +TEST_F( defaultdevicetypeinit, no_args) { + Impl::test_no_arguments(); +} + +TEST_F( defaultdevicetypeinit, commandline_args_empty) { + Kokkos::InitArguments argstruct; + int nargs = 0; + char** args = Impl::init_kokkos_args(false,false,false,false,nargs, argstruct); + Impl::test_commandline_args(nargs,args,argstruct); + for(int i = 0; i < nargs; i++) + delete [] args[i]; + delete [] args; +} + +TEST_F( defaultdevicetypeinit, commandline_args_other) { + Kokkos::InitArguments argstruct; + int nargs = 0; + char** args = Impl::init_kokkos_args(false,false,false,true,nargs, argstruct); + Impl::test_commandline_args(nargs,args,argstruct); + for(int i = 0; i < nargs; i++) + delete [] args[i]; + delete [] args; +} + +TEST_F( defaultdevicetypeinit, commandline_args_nthreads) { + Kokkos::InitArguments argstruct; + int nargs = 0; + char** args = Impl::init_kokkos_args(true,false,false,false,nargs, argstruct); + Impl::test_commandline_args(nargs,args,argstruct); + for(int i = 0; i < nargs; i++) + delete [] args[i]; + delete [] args; +} + +TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa) { + Kokkos::InitArguments argstruct; + int nargs = 0; + char** args = Impl::init_kokkos_args(true,true,false,false,nargs, argstruct); + Impl::test_commandline_args(nargs,args,argstruct); + for(int i = 0; i < nargs; i++) + delete [] args[i]; + delete [] args; +} + +TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device) { + Kokkos::InitArguments argstruct; + int nargs = 0; + char** args = Impl::init_kokkos_args(true,true,true,false,nargs, argstruct); + Impl::test_commandline_args(nargs,args,argstruct); + for(int i = 0; i < nargs; i++) + delete [] args[i]; + delete [] args; +} + +TEST_F( defaultdevicetypeinit, 
commandline_args_nthreads_device) { + Kokkos::InitArguments argstruct; + int nargs = 0; + char** args = Impl::init_kokkos_args(true,false,true,false,nargs, argstruct); + Impl::test_commandline_args(nargs,args,argstruct); + for(int i = 0; i < nargs; i++) + delete [] args[i]; + delete [] args; +} + +TEST_F( defaultdevicetypeinit, commandline_args_numa_device) { + Kokkos::InitArguments argstruct; + int nargs = 0; + char** args = Impl::init_kokkos_args(false,true,true,false,nargs, argstruct); + Impl::test_commandline_args(nargs,args,argstruct); + for(int i = 0; i < nargs; i++) + delete [] args[i]; + delete [] args; +} + +TEST_F( defaultdevicetypeinit, commandline_args_device) { + Kokkos::InitArguments argstruct; + int nargs = 0; + char** args = Impl::init_kokkos_args(false,false,true,false,nargs, argstruct); + Impl::test_commandline_args(nargs,args,argstruct); + for(int i = 0; i < nargs; i++) + delete [] args[i]; + delete [] args; +} + +TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) { + Kokkos::InitArguments argstruct; + int nargs = 0; + char** args = Impl::init_kokkos_args(true,true,true,true,nargs, argstruct); + Impl::test_commandline_args(nargs,args,argstruct); + for(int i = 0; i < nargs; i++) + delete [] args[i]; + delete [] args; +} + +TEST_F( defaultdevicetypeinit, initstruct_default) { + Kokkos::InitArguments args; + Impl::test_initstruct_args(args); +} + +TEST_F( defaultdevicetypeinit, initstruct_nthreads) { + Kokkos::InitArguments args = Impl::init_initstruct(true,false,false); + Impl::test_initstruct_args(args); +} + +TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa) { + Kokkos::InitArguments args = Impl::init_initstruct(true,true,false); + Impl::test_initstruct_args(args); +} + +TEST_F( defaultdevicetypeinit, initstruct_device) { + Kokkos::InitArguments args = Impl::init_initstruct(false,false,true); + Impl::test_initstruct_args(args); +} + +TEST_F( defaultdevicetypeinit, initstruct_nthreads_device) { + 
Kokkos::InitArguments args = Impl::init_initstruct(true,false,true); + Impl::test_initstruct_args(args); +} + + +TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa_device) { + Kokkos::InitArguments args = Impl::init_initstruct(true,true,true); + Impl::test_initstruct_args(args); +} + + + +} // namespace test + +#endif diff --git a/lib/kokkos/core/unit_test/TestHWLOC.cpp b/lib/kokkos/core/unit_test/TestHWLOC.cpp new file mode 100755 index 0000000000000000000000000000000000000000..1637dec5de4ff762cfbd259ee47932b5e85eb4d0 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestHWLOC.cpp @@ -0,0 +1,69 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <iostream> +#include <Kokkos_hwloc.hpp> + +namespace Test { + +class hwloc : public ::testing::Test { +protected: + static void SetUpTestCase() + {} + + static void TearDownTestCase() + {} +}; + +TEST_F( hwloc, query) +{ + std::cout << " NUMA[" << Kokkos::hwloc::get_available_numa_count() << "]" + << " CORE[" << Kokkos::hwloc::get_available_cores_per_numa() << "]" + << " PU[" << Kokkos::hwloc::get_available_threads_per_core() << "]" + << std::endl ; +} + +} + diff --git a/lib/kokkos/core/unit_test/TestMemorySpaceTracking.hpp b/lib/kokkos/core/unit_test/TestMemorySpaceTracking.hpp new file mode 100755 index 0000000000000000000000000000000000000000..80ffcc2afd81c102638f20a62365b3b2a071fc6e --- /dev/null +++ b/lib/kokkos/core/unit_test/TestMemorySpaceTracking.hpp @@ -0,0 +1,100 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <iostream> +#include <Kokkos_Core.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace { + +template<class Arg1> +class TestMemorySpace { +public: + + typedef typename Arg1::memory_space MemorySpace; + TestMemorySpace() { run_test(); } + + void run_test() + { + +#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + + Kokkos::View<int* ,Arg1> invalid; + ASSERT_EQ(0u, invalid.tracker().ref_count() ); + + { + Kokkos::View<int* ,Arg1> a("A",10); + + ASSERT_EQ(1u, a.tracker().ref_count() ); + + { + Kokkos::View<int* ,Arg1> b = a; + ASSERT_EQ(2u, b.tracker().ref_count() ); + + Kokkos::View<int* ,Arg1> D("D",10); + ASSERT_EQ(1u, D.tracker().ref_count() ); + + { + Kokkos::View<int* ,Arg1> E("E",10); + ASSERT_EQ(1u, E.tracker().ref_count() ); + } + + ASSERT_EQ(2u, b.tracker().ref_count() ); + } + ASSERT_EQ(1u, a.tracker().ref_count() ); + } + +#endif + + } +}; + +} + +/*--------------------------------------------------------------------------*/ + + + diff --git a/lib/kokkos/core/unit_test/TestOpenMP.cpp b/lib/kokkos/core/unit_test/TestOpenMP.cpp new file mode 100755 index 0000000000000000000000000000000000000000..8d4bcd1e2625330594fbe12997d5ea6fb5b98c20 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestOpenMP.cpp @@ -0,0 +1,375 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +//---------------------------------------------------------------------------- + +#include <TestViewImpl.hpp> +#include <TestAtomic.hpp> + +#include <TestViewAPI.hpp> +#include <TestViewSubview.hpp> + +#include <TestSharedAlloc.hpp> +#include <TestViewMapping.hpp> + +#include <TestRange.hpp> +#include <TestTeam.hpp> +#include <TestReduce.hpp> +#include <TestScan.hpp> +#include <TestAggregate.hpp> +#include <TestAggregateReduction.hpp> +#include <TestCompilerMacros.hpp> +#include <TestCXX11.hpp> +#include <TestCXX11Deduction.hpp> +#include <TestTeamVector.hpp> +#include <TestMemorySpaceTracking.hpp> +#include <TestTemplateMetaFunctions.hpp> + +namespace Test { + +class openmp : public ::testing::Test { +protected: + static void SetUpTestCase() + { + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); + + const unsigned threads_count = std::max( 1u , numa_count ) * + std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 ); + + Kokkos::OpenMP::initialize( threads_count ); + Kokkos::OpenMP::print_configuration( std::cout , true ); + } + + static void TearDownTestCase() + { + Kokkos::OpenMP::finalize(); + + omp_set_num_threads(1); + + ASSERT_EQ( 1 , omp_get_max_threads() ); + } +}; + + +TEST_F( openmp , impl_shared_alloc ) { + test_shared_alloc< Kokkos::HostSpace , Kokkos::OpenMP >(); +} + +TEST_F( openmp , impl_view_mapping ) { + test_view_mapping< Kokkos::OpenMP >(); + test_view_mapping_subview< Kokkos::OpenMP >(); + test_view_mapping_operator< Kokkos::OpenMP >(); + TestViewMappingAtomic< Kokkos::OpenMP >::run(); +} + +TEST_F( openmp, view_impl) { + test_view_impl< Kokkos::OpenMP >(); +} + 
+TEST_F( openmp, view_api) { + TestViewAPI< double , Kokkos::OpenMP >(); +} + + +TEST_F( openmp, view_subview_auto_1d_left ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::OpenMP >(); +} + +TEST_F( openmp, view_subview_auto_1d_right ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::OpenMP >(); +} + +TEST_F( openmp, view_subview_auto_1d_stride ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::OpenMP >(); +} + +TEST_F( openmp, view_subview_assign_strided ) { + TestViewSubview::test_1d_strided_assignment< Kokkos::OpenMP >(); +} + +TEST_F( openmp, view_subview_left_0 ) { + TestViewSubview::test_left_0< Kokkos::OpenMP >(); +} + +TEST_F( openmp, view_subview_left_1 ) { + TestViewSubview::test_left_1< Kokkos::OpenMP >(); +} + +TEST_F( openmp, view_subview_left_2 ) { + TestViewSubview::test_left_2< Kokkos::OpenMP >(); +} + +TEST_F( openmp, view_subview_left_3 ) { + TestViewSubview::test_left_3< Kokkos::OpenMP >(); +} + +TEST_F( openmp, view_subview_right_0 ) { + TestViewSubview::test_right_0< Kokkos::OpenMP >(); +} + +TEST_F( openmp, view_subview_right_1 ) { + TestViewSubview::test_right_1< Kokkos::OpenMP >(); +} + +TEST_F( openmp, view_subview_right_3 ) { + TestViewSubview::test_right_3< Kokkos::OpenMP >(); +} + + + +TEST_F( openmp , range_tag ) +{ + TestRange< Kokkos::OpenMP >::test_for(1000); + TestRange< Kokkos::OpenMP >::test_reduce(1000); + TestRange< Kokkos::OpenMP >::test_scan(1000); +} + +TEST_F( openmp , team_tag ) +{ + TestTeamPolicy< Kokkos::OpenMP >::test_for(1000); + TestTeamPolicy< Kokkos::OpenMP >::test_reduce(1000); +} + +TEST_F( openmp, long_reduce) { + TestReduce< long , Kokkos::OpenMP >( 1000000 ); +} + +TEST_F( openmp, double_reduce) { + TestReduce< double , Kokkos::OpenMP >( 1000000 ); +} + +TEST_F( openmp, long_reduce_dynamic ) { + TestReduceDynamic< long , Kokkos::OpenMP >( 1000000 ); +} + +TEST_F( openmp, double_reduce_dynamic ) { + TestReduceDynamic< double , Kokkos::OpenMP >( 1000000 ); +} + +TEST_F( 
openmp, long_reduce_dynamic_view ) { + TestReduceDynamicView< long , Kokkos::OpenMP >( 1000000 ); +} + +TEST_F( openmp, team_long_reduce) { + TestReduceTeam< long , Kokkos::OpenMP >( 100000 ); +} + +TEST_F( openmp, team_double_reduce) { + TestReduceTeam< double , Kokkos::OpenMP >( 100000 ); +} + +TEST_F( openmp, team_shared_request) { + TestSharedTeam< Kokkos::OpenMP >(); +} + + +TEST_F( openmp , atomics ) +{ + const int loop_count = 1e4 ; + + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,1) ) ); + ASSERT_TRUE( ( 
TestAtomic::Loop<float,Kokkos::OpenMP>(100,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,3) ) ); + +#if defined( KOKKOS_ENABLE_ASM ) + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,3) ) ); +#endif +} + +TEST_F( openmp , view_remap ) +{ + enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 }; + + typedef Kokkos::View< double*[N1][N2][N3] , + Kokkos::LayoutRight , + Kokkos::OpenMP > output_type ; + + typedef Kokkos::View< int**[N2][N3] , + Kokkos::LayoutLeft , + Kokkos::OpenMP > input_type ; + + typedef Kokkos::View< int*[N0][N2][N3] , + Kokkos::LayoutLeft , + Kokkos::OpenMP > diff_type ; + + output_type output( "output" , N0 ); + input_type input ( "input" , N0 , N1 ); + diff_type diff ( "diff" , N0 ); + + int value = 0 ; + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) { + input(i0,i1,i2,i3) = ++value ; + }}}} + + // Kokkos::deep_copy( diff , input ); // throw with incompatible shape + Kokkos::deep_copy( output , input ); + + value = 0 ; + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) { + ++value ; + ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) ); + }}}} +} + +//---------------------------------------------------------------------------- + + +TEST_F( openmp , view_aggregate ) +{ + TestViewAggregate< Kokkos::OpenMP >(); + TestViewAggregateReduction< Kokkos::OpenMP >(); +} + +//---------------------------------------------------------------------------- + +TEST_F( openmp , scan ) +{ + TestScan< Kokkos::OpenMP >::test_range( 1 , 1000 ); + TestScan< Kokkos::OpenMP >( 1000000 ); + TestScan< Kokkos::OpenMP >( 10000000 
); + Kokkos::OpenMP::fence(); +} + + +TEST_F( openmp , team_scan ) +{ + TestScanTeam< Kokkos::OpenMP >( 10000 ); + TestScanTeam< Kokkos::OpenMP >( 10000 ); +} + +//---------------------------------------------------------------------------- + +TEST_F( openmp , compiler_macros ) +{ + ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::OpenMP >() ) ); +} + +//---------------------------------------------------------------------------- + +TEST_F( openmp , memory_space ) +{ + TestMemorySpace< Kokkos::OpenMP >(); +} + +//---------------------------------------------------------------------------- + +TEST_F( openmp , template_meta_functions ) +{ + TestTemplateMetaFunctions<int, Kokkos::OpenMP >(); +} + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_HAVE_CXX11 ) && defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) +TEST_F( openmp , cxx11 ) +{ + if ( Kokkos::Impl::is_same< Kokkos::DefaultExecutionSpace , Kokkos::OpenMP >::value ) { + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(1) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(2) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(3) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(4) ) ); + } +} +#endif + +#if defined (KOKKOS_HAVE_CXX11) +TEST_F( openmp , reduction_deduction ) +{ + TestCXX11::test_reduction_deduction< Kokkos::OpenMP >(); +} + +TEST_F( openmp , team_vector ) +{ + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(0) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(1) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(2) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(3) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(4) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(5) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(6) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(7) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(8) ) ); + 
ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(9) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(10) ) ); +} +#endif +} // namespace test + diff --git a/lib/kokkos/core/unit_test/TestQthread.cpp b/lib/kokkos/core/unit_test/TestQthread.cpp new file mode 100755 index 0000000000000000000000000000000000000000..19bfa6bde4cc379370eee7501adc9926573580a5 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestQthread.cpp @@ -0,0 +1,283 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <Kokkos_Qthread.hpp> + +#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp> + +//---------------------------------------------------------------------------- + +#include <TestViewImpl.hpp> +#include <TestAtomic.hpp> + +#include <TestViewAPI.hpp> + +#include <TestTeam.hpp> +#include <TestRange.hpp> +#include <TestReduce.hpp> +#include <TestScan.hpp> +#include <TestAggregate.hpp> +#include <TestCompilerMacros.hpp> +#include <TestTaskPolicy.hpp> +// #include <TestTeamVector.hpp> + +namespace Test { + +class qthread : public ::testing::Test { +protected: + static void SetUpTestCase() + { + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); + + int threads_count = std::max( 1u , numa_count ) + * std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 ); + Kokkos::Qthread::initialize( threads_count ); + Kokkos::Qthread::print_configuration( std::cout , true ); + } + + static void TearDownTestCase() + { + Kokkos::Qthread::finalize(); + } +}; + +TEST_F( qthread , compiler_macros ) +{ + ASSERT_TRUE( ( TestCompilerMacros::Test< 
Kokkos::Qthread >() ) ); +} + +TEST_F( qthread, view_impl) { + test_view_impl< Kokkos::Qthread >(); +} + +TEST_F( qthread, view_api) { + TestViewAPI< double , Kokkos::Qthread >(); +} + +TEST_F( qthread , range_tag ) +{ + TestRange< Kokkos::Qthread >::test_for(1000); + TestRange< Kokkos::Qthread >::test_reduce(1000); + TestRange< Kokkos::Qthread >::test_scan(1000); +} + +TEST_F( qthread , team_tag ) +{ + TestTeamPolicy< Kokkos::Qthread >::test_for( 1000 ); + TestTeamPolicy< Kokkos::Qthread >::test_reduce( 1000 ); +} + +TEST_F( qthread, long_reduce) { + TestReduce< long , Kokkos::Qthread >( 1000000 ); +} + +TEST_F( qthread, double_reduce) { + TestReduce< double , Kokkos::Qthread >( 1000000 ); +} + +TEST_F( qthread, long_reduce_dynamic ) { + TestReduceDynamic< long , Kokkos::Qthread >( 1000000 ); +} + +TEST_F( qthread, double_reduce_dynamic ) { + TestReduceDynamic< double , Kokkos::Qthread >( 1000000 ); +} + +TEST_F( qthread, long_reduce_dynamic_view ) { + TestReduceDynamicView< long , Kokkos::Qthread >( 1000000 ); +} + +TEST_F( qthread, team_long_reduce) { + TestReduceTeam< long , Kokkos::Qthread >( 1000000 ); +} + +TEST_F( qthread, team_double_reduce) { + TestReduceTeam< double , Kokkos::Qthread >( 1000000 ); +} + + +TEST_F( qthread , atomics ) +{ + const int loop_count = 1e4 ; + + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long 
int,Kokkos::Qthread>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,3) ) ); + +#if defined( KOKKOS_ENABLE_ASM ) + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,3) ) ); +#endif + +} + +TEST_F( qthread , view_remap ) +{ + enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 }; + + typedef Kokkos::View< double*[N1][N2][N3] , + Kokkos::LayoutRight , + Kokkos::Qthread > output_type ; + + typedef Kokkos::View< int**[N2][N3] , + Kokkos::LayoutLeft , + Kokkos::Qthread > input_type ; + + typedef Kokkos::View< int*[N0][N2][N3] , + Kokkos::LayoutLeft , + Kokkos::Qthread > diff_type ; + + output_type output( "output" , N0 ); + input_type input ( "input" , N0 , N1 ); + diff_type diff ( "diff" , N0 ); + + int value = 0 ; + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i0 = 0 ; i0 < N0 
; ++i0 ) { + input(i0,i1,i2,i3) = ++value ; + }}}} + + // Kokkos::deep_copy( diff , input ); // throw with incompatible shape + Kokkos::deep_copy( output , input ); + + value = 0 ; + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) { + ++value ; + ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) ); + }}}} +} + +//---------------------------------------------------------------------------- + +TEST_F( qthread , view_aggregate ) +{ + TestViewAggregate< Kokkos::Qthread >(); +} + +//---------------------------------------------------------------------------- + +TEST_F( qthread , scan ) +{ + TestScan< Kokkos::Qthread >::test_range( 1 , 1000 ); + TestScan< Kokkos::Qthread >( 1000000 ); + TestScan< Kokkos::Qthread >( 10000000 ); + Kokkos::Qthread::fence(); +} + +TEST_F( qthread, team_shared ) { + TestSharedTeam< Kokkos::Qthread >(); +} + +TEST_F( qthread , team_scan ) +{ + TestScanTeam< Kokkos::Qthread >( 10 ); + TestScanTeam< Kokkos::Qthread >( 10000 ); +} + +#if defined (KOKKOS_HAVE_CXX11) && 0 /* disable */ +TEST_F( qthread , team_vector ) +{ + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(0) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(1) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(2) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(3) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(4) ) ); +} +#endif + +//---------------------------------------------------------------------------- + +TEST_F( qthread , task_policy ) +{ + TestTaskPolicy::test_task_dep< Kokkos::Qthread >( 10 ); + for ( long i = 0 ; i < 25 ; ++i ) TestTaskPolicy::test_fib< Kokkos::Qthread >(i); + for ( long i = 0 ; i < 35 ; ++i ) TestTaskPolicy::test_fib2< Kokkos::Qthread >(i); +} + +#if defined( KOKKOS_HAVE_CXX11 ) +TEST_F( qthread , task_team ) +{ + std::cout << "qthread.task_team test disabled due to unresolved error 
causing the test to hang." << std::endl ; + // TestTaskPolicy::test_task_team< Kokkos::Qthread >(1000); +} +#endif + +//---------------------------------------------------------------------------- + +} // namespace test + diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp new file mode 100755 index 0000000000000000000000000000000000000000..1af53132723209831c3a28384523f539bc456720 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestRange.hpp @@ -0,0 +1,171 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdio.h> + +#include <Kokkos_Core.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Test { +namespace { + +template< class ExecSpace > +struct TestRange { + + typedef int value_type ; ///< typedef required for the parallel_reduce + + typedef Kokkos::View<int*,ExecSpace> view_type ; + + view_type m_flags ; + + struct VerifyInitTag {}; + struct ResetTag {}; + struct VerifyResetTag {}; + + TestRange( const size_t N ) + : m_flags( Kokkos::ViewAllocateWithoutInitializing("flags"), N ) + {} + + static void test_for( const size_t N ) + { + TestRange functor(N); + + typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( functor.m_flags ); + + Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace>(0,N) , functor ); + Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,VerifyInitTag>(0,N) , functor ); + + Kokkos::deep_copy( host_flags , functor.m_flags ); + + size_t error_count = 0 ; + for ( size_t i = 0 ; i < N ; ++i ) { + if ( int(i) != host_flags(i) ) ++error_count ; + } + ASSERT_EQ( error_count , size_t(0) ); + + Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ResetTag>(0,N) , functor ); + Kokkos::parallel_for( std::string("TestKernelFor") , Kokkos::RangePolicy<ExecSpace,VerifyResetTag>(0,N) , 
functor ); + + Kokkos::deep_copy( host_flags , functor.m_flags ); + + error_count = 0 ; + for ( size_t i = 0 ; i < N ; ++i ) { + if ( int(2*i) != host_flags(i) ) ++error_count ; + } + ASSERT_EQ( error_count , size_t(0) ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i ) const + { m_flags(i) = i ; } + + KOKKOS_INLINE_FUNCTION + void operator()( const VerifyInitTag & , const int i ) const + { if ( i != m_flags(i) ) { printf("TestRange::test_for error at %d != %d\n",i,m_flags(i)); } } + + KOKKOS_INLINE_FUNCTION + void operator()( const ResetTag & , const int i ) const + { m_flags(i) = 2 * m_flags(i); } + + KOKKOS_INLINE_FUNCTION + void operator()( const VerifyResetTag & , const int i ) const + { if ( 2 * i != m_flags(i) ) { printf("TestRange::test_for error at %d != %d\n",i,m_flags(i)); } } + + //---------------------------------------- + + struct OffsetTag {}; + + static void test_reduce( const size_t N ) + { + TestRange functor(N); + int total = 0 ; + + Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace>(0,N) , functor ); + + Kokkos::parallel_reduce( "TestKernelReduce" , Kokkos::RangePolicy<ExecSpace>(0,N) , functor , total ); + // sum( 0 .. N-1 ) + ASSERT_EQ( size_t((N-1)*(N)/2) , size_t(total) ); + + Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,OffsetTag>(0,N) , functor , total ); + // sum( 1 .. 
N ) + ASSERT_EQ( size_t((N)*(N+1)/2) , size_t(total) ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i , value_type & update ) const + { update += m_flags(i); } + + KOKKOS_INLINE_FUNCTION + void operator()( const OffsetTag & , const int i , value_type & update ) const + { update += 1 + m_flags(i); } + + //---------------------------------------- + + static void test_scan( const size_t N ) + { + TestRange functor(N); + + Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace>(0,N) , functor ); + + Kokkos::parallel_scan( "TestKernelScan" , Kokkos::RangePolicy<ExecSpace,OffsetTag>(0,N) , functor ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const OffsetTag & , const int i , value_type & update , bool final ) const + { + update += m_flags(i); + + if ( final ) { + if ( update != (i*(i+1))/2 ) { + printf("TestRange::test_scan error %d : %d != %d\n",i,(i*(i+1))/2,m_flags(i)); + } + } + } +}; + +} /* namespace */ +} /* namespace Test */ + +/*--------------------------------------------------------------------------*/ + diff --git a/lib/kokkos/core/unit_test/TestReduce.hpp b/lib/kokkos/core/unit_test/TestReduce.hpp new file mode 100755 index 0000000000000000000000000000000000000000..30b94d40fb43a854fc85352c7a779a32f4cf32ea --- /dev/null +++ b/lib/kokkos/core/unit_test/TestReduce.hpp @@ -0,0 +1,371 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template< typename ScalarType , class DeviceType > +class ReduceFunctor +{ +public: + typedef DeviceType execution_space ; + typedef typename execution_space::size_type size_type ; + + struct value_type { + ScalarType value[3] ; + }; + + const size_type nwork ; + + ReduceFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {} + + ReduceFunctor( const ReduceFunctor & rhs ) + : nwork( rhs.nwork ) {} + +/* + KOKKOS_INLINE_FUNCTION + void init( value_type & dst ) const + { + dst.value[0] = 0 ; + dst.value[1] = 0 ; + dst.value[2] = 0 ; + } +*/ + + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & dst , + const volatile value_type & src ) const + { + dst.value[0] += src.value[0] ; + dst.value[1] += src.value[1] ; + dst.value[2] += src.value[2] ; + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type iwork , value_type & dst ) const + { + dst.value[0] += 1 ; + dst.value[1] += iwork + 1 ; + dst.value[2] += nwork - iwork ; + } +}; + +template< class DeviceType > +class ReduceFunctorFinal : public ReduceFunctor< long , DeviceType > { +public: + + typedef typename ReduceFunctor< long , DeviceType >::value_type value_type ; + + ReduceFunctorFinal( const size_t n ) + : ReduceFunctor<long,DeviceType>(n) + {} + + KOKKOS_INLINE_FUNCTION + void final( value_type & dst ) const + { + dst.value[0] = - dst.value[0] ; + dst.value[1] = - dst.value[1] ; + dst.value[2] = - dst.value[2] ; + } +}; + +template< typename ScalarType , class DeviceType > +class RuntimeReduceFunctor +{ +public: + // Required for functor: + typedef DeviceType execution_space ; + typedef ScalarType value_type[] ; + const unsigned value_count ; + + + // Unit test 
details: + + typedef typename execution_space::size_type size_type ; + + const size_type nwork ; + + RuntimeReduceFunctor( const size_type arg_nwork , + const size_type arg_count ) + : value_count( arg_count ) + , nwork( arg_nwork ) {} + +/* + KOKKOS_INLINE_FUNCTION + void init( value_type dst ) const + { + for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] = 0 ; + } +*/ + + KOKKOS_INLINE_FUNCTION + void join( volatile ScalarType dst[] , + const volatile ScalarType src[] ) const + { + for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] += src[i] ; + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type iwork , ScalarType dst[] ) const + { + const size_type tmp[3] = { 1 , iwork + 1 , nwork - iwork }; + + for ( size_type i = 0 ; i < value_count ; ++i ) { + dst[i] += tmp[ i % 3 ]; + } + } +}; + +template< class DeviceType > +class RuntimeReduceFunctorFinal : public RuntimeReduceFunctor< long , DeviceType > { +public: + + typedef RuntimeReduceFunctor< long , DeviceType > base_type ; + typedef typename base_type::value_type value_type ; + typedef long scalar_type ; + + RuntimeReduceFunctorFinal( const size_t theNwork , const size_t count ) : base_type(theNwork,count) {} + + KOKKOS_INLINE_FUNCTION + void final( value_type dst ) const + { + for ( unsigned i = 0 ; i < base_type::value_count ; ++i ) { + dst[i] = - dst[i] ; + } + } +}; +} // namespace Test + +namespace { + +template< typename ScalarType , class DeviceType > +class TestReduce +{ +public: + typedef DeviceType execution_space ; + typedef typename execution_space::size_type size_type ; + + //------------------------------------ + + TestReduce( const size_type & nwork ) + { + run_test(nwork); + run_test_final(nwork); + } + + void run_test( const size_type & nwork ) + { + typedef Test::ReduceFunctor< ScalarType , execution_space > functor_type ; + typedef typename functor_type::value_type value_type ; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + value_type result[ Repeat ]; + + const unsigned long 
nw = nwork ; + const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 ) + : (nw/2) * ( nw + 1 ); + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + Kokkos::parallel_reduce( nwork , functor_type(nwork) , result[i] ); + } + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + for ( unsigned j = 0 ; j < Count ; ++j ) { + const unsigned long correct = 0 == j % 3 ? nw : nsum ; + ASSERT_EQ( (ScalarType) correct , result[i].value[j] ); + } + } + } + + void run_test_final( const size_type & nwork ) + { + typedef Test::ReduceFunctorFinal< execution_space > functor_type ; + typedef typename functor_type::value_type value_type ; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + value_type result[ Repeat ]; + + const unsigned long nw = nwork ; + const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 ) + : (nw/2) * ( nw + 1 ); + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + Kokkos::parallel_reduce( nwork , functor_type(nwork) , result[i] ); + } + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + for ( unsigned j = 0 ; j < Count ; ++j ) { + const unsigned long correct = 0 == j % 3 ? nw : nsum ; + ASSERT_EQ( (ScalarType) correct , - result[i].value[j] ); + } + } + } +}; + +template< typename ScalarType , class DeviceType > +class TestReduceDynamic +{ +public: + typedef DeviceType execution_space ; + typedef typename execution_space::size_type size_type ; + + //------------------------------------ + + TestReduceDynamic( const size_type nwork ) + { + run_test_dynamic(nwork); + run_test_dynamic_final(nwork); + } + + void run_test_dynamic( const size_type nwork ) + { + typedef Test::RuntimeReduceFunctor< ScalarType , execution_space > functor_type ; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + ScalarType result[ Repeat ][ Count ] ; + + const unsigned long nw = nwork ; + const unsigned long nsum = nw % 2 ? 
nw * (( nw + 1 )/2 ) + : (nw/2) * ( nw + 1 ); + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] ); + } + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + for ( unsigned j = 0 ; j < Count ; ++j ) { + const unsigned long correct = 0 == j % 3 ? nw : nsum ; + ASSERT_EQ( (ScalarType) correct , result[i][j] ); + } + } + } + + void run_test_dynamic_final( const size_type nwork ) + { + typedef Test::RuntimeReduceFunctorFinal< execution_space > functor_type ; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + typename functor_type::scalar_type result[ Repeat ][ Count ] ; + + const unsigned long nw = nwork ; + const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 ) + : (nw/2) * ( nw + 1 ); + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + Kokkos::parallel_reduce( "TestKernelReduce" , nwork , functor_type(nwork,Count) , result[i] ); + } + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + for ( unsigned j = 0 ; j < Count ; ++j ) { + const unsigned long correct = 0 == j % 3 ? nw : nsum ; + ASSERT_EQ( (ScalarType) correct , - result[i][j] ); + } + } + } +}; + +template< typename ScalarType , class DeviceType > +class TestReduceDynamicView +{ +public: + typedef DeviceType execution_space ; + typedef typename execution_space::size_type size_type ; + + //------------------------------------ + + TestReduceDynamicView( const size_type nwork ) + { + run_test_dynamic_view(nwork); + } + + void run_test_dynamic_view( const size_type nwork ) + { + typedef Test::RuntimeReduceFunctor< ScalarType , execution_space > functor_type ; + + typedef Kokkos::View< ScalarType* , DeviceType > result_type ; + typedef typename result_type::HostMirror result_host_type ; + + const unsigned CountLimit = 23 ; + + const unsigned long nw = nwork ; + const unsigned long nsum = nw % 2 ? 
nw * (( nw + 1 )/2 ) + : (nw/2) * ( nw + 1 ); + + for ( unsigned count = 0 ; count < CountLimit ; ++count ) { + + result_type result("result",count); + result_host_type host_result = Kokkos::create_mirror( result ); + + // Test result to host pointer: + + std::string str("TestKernelReduce"); + Kokkos::parallel_reduce( str , nw , functor_type(nw,count) , host_result.ptr_on_device() ); + + for ( unsigned j = 0 ; j < count ; ++j ) { + const unsigned long correct = 0 == j % 3 ? nw : nsum ; + ASSERT_EQ( host_result(j), (ScalarType) correct ); + host_result(j) = 0 ; + } + } + } +}; + +} + +/*--------------------------------------------------------------------------*/ + diff --git a/lib/kokkos/core/unit_test/TestScan.hpp b/lib/kokkos/core/unit_test/TestScan.hpp new file mode 100755 index 0000000000000000000000000000000000000000..eb5e833a1d1ddfddf89ed858d80144d38192c182 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestScan.hpp @@ -0,0 +1,97 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/*--------------------------------------------------------------------------*/ + +#include <stdio.h> + +namespace Test { + +template< class Device , class WorkSpec = size_t > +struct TestScan { + + typedef Device execution_space ; + typedef long int value_type ; + + KOKKOS_INLINE_FUNCTION + void operator()( const int iwork , value_type & update , const bool final_pass ) const + { + const value_type n = iwork + 1 ; + const value_type imbalance = ( (1000 <= n) && (0 == n % 1000) ) ? 1000 : 0 ; + + // Insert an artificial load imbalance + + for ( value_type i = 0 ; i < imbalance ; ++i ) { ++update ; } + + update += n - imbalance ; + + if ( final_pass ) { + const value_type answer = n & 1 ? 
( n * ( ( n + 1 ) / 2 ) ) : ( ( n / 2 ) * ( n + 1 ) ); + + if ( answer != update ) { + printf("TestScan(%d,%ld) != %ld\n",iwork,update,answer); + } + } + } + + KOKKOS_INLINE_FUNCTION + void init( value_type & update ) const { update = 0 ; } + + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & update , + volatile const value_type & input ) const + { update += input ; } + + TestScan( const WorkSpec & N ) + { parallel_scan( N , *this ); } + + static void test_range( const WorkSpec & begin , const WorkSpec & end ) + { + for ( WorkSpec i = begin ; i < end ; ++i ) { + (void) TestScan( i ); + } + } +}; + +} + diff --git a/lib/kokkos/core/unit_test/TestSerial.cpp b/lib/kokkos/core/unit_test/TestSerial.cpp new file mode 100755 index 0000000000000000000000000000000000000000..dbe94005e80e3b6ef80f6579135ffd199ba2bf26 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestSerial.cpp @@ -0,0 +1,419 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if ! 
defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + +#include <impl/Kokkos_ViewTileLeft.hpp> +#include <TestTile.hpp> + +#endif + +#include <impl/Kokkos_Serial_TaskPolicy.hpp> + +//---------------------------------------------------------------------------- + +#include <TestSharedAlloc.hpp> +#include <TestViewMapping.hpp> + +#include <TestViewImpl.hpp> + +#include <TestViewAPI.hpp> +#include <TestViewOfClass.hpp> +#include <TestViewSubview.hpp> +#include <TestAtomic.hpp> +#include <TestRange.hpp> +#include <TestTeam.hpp> +#include <TestReduce.hpp> +#include <TestScan.hpp> +#include <TestAggregate.hpp> +#include <TestAggregateReduction.hpp> +#include <TestCompilerMacros.hpp> +#include <TestTaskPolicy.hpp> +#include <TestCXX11.hpp> +#include <TestCXX11Deduction.hpp> +#include <TestTeamVector.hpp> +#include <TestMemorySpaceTracking.hpp> +#include <TestTemplateMetaFunctions.hpp> + +namespace Test { + +class serial : public ::testing::Test { +protected: + static void SetUpTestCase() + { + Kokkos::HostSpace::execution_space::initialize(); + } + static void TearDownTestCase() + { + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +TEST_F( serial , impl_shared_alloc ) { + test_shared_alloc< Kokkos::HostSpace , Kokkos::Serial >(); +} + +TEST_F( serial , impl_view_mapping ) { + test_view_mapping< Kokkos::Serial >(); + test_view_mapping_subview< Kokkos::Serial >(); + test_view_mapping_operator< Kokkos::Serial >(); + TestViewMappingAtomic< Kokkos::Serial >::run(); +} + +TEST_F( serial, view_impl) { + test_view_impl< Kokkos::Serial >(); +} + +TEST_F( serial, view_api) { + TestViewAPI< double , Kokkos::Serial >(); +} + +TEST_F( serial , view_nested_view ) +{ + ::Test::view_nested_view< Kokkos::Serial >(); +} + +TEST_F( serial, view_subview_auto_1d_left ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Serial >(); +} + +TEST_F( serial, view_subview_auto_1d_right ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Serial >(); +} + +TEST_F( serial, 
view_subview_auto_1d_stride ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Serial >(); +} + +TEST_F( serial, view_subview_assign_strided ) { + TestViewSubview::test_1d_strided_assignment< Kokkos::Serial >(); +} + +TEST_F( serial, view_subview_left_0 ) { + TestViewSubview::test_left_0< Kokkos::Serial >(); +} + +TEST_F( serial, view_subview_left_1 ) { + TestViewSubview::test_left_1< Kokkos::Serial >(); +} + +TEST_F( serial, view_subview_left_2 ) { + TestViewSubview::test_left_2< Kokkos::Serial >(); +} + +TEST_F( serial, view_subview_left_3 ) { + TestViewSubview::test_left_3< Kokkos::Serial >(); +} + +TEST_F( serial, view_subview_right_0 ) { + TestViewSubview::test_right_0< Kokkos::Serial >(); +} + +TEST_F( serial, view_subview_right_1 ) { + TestViewSubview::test_right_1< Kokkos::Serial >(); +} + +TEST_F( serial, view_subview_right_3 ) { + TestViewSubview::test_right_3< Kokkos::Serial >(); +} + +TEST_F( serial , range_tag ) +{ + TestRange< Kokkos::Serial >::test_for(1000); + TestRange< Kokkos::Serial >::test_reduce(1000); + TestRange< Kokkos::Serial >::test_scan(1000); +} + +TEST_F( serial , team_tag ) +{ + TestTeamPolicy< Kokkos::Serial >::test_for( 1000 ); + TestTeamPolicy< Kokkos::Serial >::test_reduce( 1000 ); +} + +TEST_F( serial, long_reduce) { + TestReduce< long , Kokkos::Serial >( 1000000 ); +} + +TEST_F( serial, double_reduce) { + TestReduce< double , Kokkos::Serial >( 1000000 ); +} + +TEST_F( serial, long_reduce_dynamic ) { + TestReduceDynamic< long , Kokkos::Serial >( 1000000 ); +} + +TEST_F( serial, double_reduce_dynamic ) { + TestReduceDynamic< double , Kokkos::Serial >( 1000000 ); +} + +TEST_F( serial, long_reduce_dynamic_view ) { + TestReduceDynamicView< long , Kokkos::Serial >( 1000000 ); +} + +TEST_F( serial , scan ) +{ + TestScan< Kokkos::Serial >::test_range( 1 , 1000 ); + TestScan< Kokkos::Serial >( 10 ); + TestScan< Kokkos::Serial >( 10000 ); +} + +TEST_F( serial , team_long_reduce) { + TestReduceTeam< long , Kokkos::Serial >( 
100000 ); +} + +TEST_F( serial , team_double_reduce) { + TestReduceTeam< double , Kokkos::Serial >( 100000 ); +} + +TEST_F( serial , team_shared_request) { + TestSharedTeam< Kokkos::Serial >(); +} + +TEST_F( serial , team_scan ) +{ + TestScanTeam< Kokkos::Serial >( 10 ); + TestScanTeam< Kokkos::Serial >( 10000 ); +} + + +TEST_F( serial , view_remap ) +{ + enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 }; + + typedef Kokkos::View< double*[N1][N2][N3] , + Kokkos::LayoutRight , + Kokkos::Serial > output_type ; + + typedef Kokkos::View< int**[N2][N3] , + Kokkos::LayoutLeft , + Kokkos::Serial > input_type ; + + typedef Kokkos::View< int*[N0][N2][N3] , + Kokkos::LayoutLeft , + Kokkos::Serial > diff_type ; + + output_type output( "output" , N0 ); + input_type input ( "input" , N0 , N1 ); + diff_type diff ( "diff" , N0 ); + + int value = 0 ; + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) { + input(i0,i1,i2,i3) = ++value ; + }}}} + + // Kokkos::deep_copy( diff , input ); // throw with incompatible shape + Kokkos::deep_copy( output , input ); + + value = 0 ; + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) { + ++value ; + ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) ); + }}}} +} + +//---------------------------------------------------------------------------- + +TEST_F( serial , view_aggregate ) +{ + TestViewAggregate< Kokkos::Serial >(); + TestViewAggregateReduction< Kokkos::Serial >(); +} + +//---------------------------------------------------------------------------- + +TEST_F( serial , atomics ) +{ + const int loop_count = 1e6 ; + + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,2) ) ); + ASSERT_TRUE( ( 
TestAtomic::Loop<int,Kokkos::Serial>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,3) ) ); +} + +//---------------------------------------------------------------------------- + +#if ! 
defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + +TEST_F( serial, tile_layout ) +{ + TestTile::test< Kokkos::Serial , 1 , 1 >( 1 , 1 ); + TestTile::test< Kokkos::Serial , 1 , 1 >( 2 , 3 ); + TestTile::test< Kokkos::Serial , 1 , 1 >( 9 , 10 ); + + TestTile::test< Kokkos::Serial , 2 , 2 >( 1 , 1 ); + TestTile::test< Kokkos::Serial , 2 , 2 >( 2 , 3 ); + TestTile::test< Kokkos::Serial , 2 , 2 >( 4 , 4 ); + TestTile::test< Kokkos::Serial , 2 , 2 >( 9 , 9 ); + + TestTile::test< Kokkos::Serial , 2 , 4 >( 9 , 9 ); + TestTile::test< Kokkos::Serial , 4 , 2 >( 9 , 9 ); + + TestTile::test< Kokkos::Serial , 4 , 4 >( 1 , 1 ); + TestTile::test< Kokkos::Serial , 4 , 4 >( 4 , 4 ); + TestTile::test< Kokkos::Serial , 4 , 4 >( 9 , 9 ); + TestTile::test< Kokkos::Serial , 4 , 4 >( 9 , 11 ); + + TestTile::test< Kokkos::Serial , 8 , 8 >( 1 , 1 ); + TestTile::test< Kokkos::Serial , 8 , 8 >( 4 , 4 ); + TestTile::test< Kokkos::Serial , 8 , 8 >( 9 , 9 ); + TestTile::test< Kokkos::Serial , 8 , 8 >( 9 , 11 ); +} + +#endif + +//---------------------------------------------------------------------------- + +TEST_F( serial , compiler_macros ) +{ + ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Serial >() ) ); +} + +//---------------------------------------------------------------------------- + +TEST_F( serial , memory_space ) +{ + TestMemorySpace< Kokkos::Serial >(); +} + +//---------------------------------------------------------------------------- + +TEST_F( serial , task_policy ) +{ + TestTaskPolicy::test_task_dep< Kokkos::Serial >( 10 ); + // TestTaskPolicy::test_norm2< Kokkos::Serial >( 1000 ); + // for ( long i = 0 ; i < 30 ; ++i ) TestTaskPolicy::test_fib< Kokkos::Serial >(i); + // for ( long i = 0 ; i < 40 ; ++i ) TestTaskPolicy::test_fib2< Kokkos::Serial >(i); + for ( long i = 0 ; i < 20 ; ++i ) TestTaskPolicy::test_fib< Kokkos::Serial >(i); + for ( long i = 0 ; i < 25 ; ++i ) TestTaskPolicy::test_fib2< Kokkos::Serial >(i); +} + +#if defined( KOKKOS_HAVE_CXX11 ) +TEST_F( serial , task_team 
) +{ + TestTaskPolicy::test_task_team< Kokkos::Serial >(1000); +} +#endif + +//---------------------------------------------------------------------------- + +TEST_F( serial , template_meta_functions ) +{ + TestTemplateMetaFunctions<int, Kokkos::Serial >(); +} + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_HAVE_CXX11 ) && defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) +TEST_F( serial , cxx11 ) +{ + if ( Kokkos::Impl::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Serial >::value ) { + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(1) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(2) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(3) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(4) ) ); + } +} +#endif + +#if defined (KOKKOS_HAVE_CXX11) +TEST_F( serial , reduction_deduction ) +{ + TestCXX11::test_reduction_deduction< Kokkos::Serial >(); +} + +TEST_F( serial , team_vector ) +{ + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(0) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(1) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(2) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(3) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(4) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(5) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(6) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(7) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(8) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(9) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(10) ) ); +} +#endif + +} // namespace test + diff --git a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp new file mode 100755 index 0000000000000000000000000000000000000000..060f5f4605d1b70e76918f05b103a24d778bcd59 --- /dev/null +++ 
b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp @@ -0,0 +1,204 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +struct SharedAllocDestroy { + + volatile int * count ; + + SharedAllocDestroy() = default ; + SharedAllocDestroy( int * arg ) : count( arg ) {} + + void destroy_shared_allocation() + { + Kokkos::atomic_fetch_add( count , 1 ); + } + +}; + +template< class MemorySpace , class ExecutionSpace > +void test_shared_alloc() +{ +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + + typedef const Kokkos::Experimental::Impl::SharedAllocationHeader Header ; + typedef Kokkos::Experimental::Impl::SharedAllocationTracker Tracker ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordMemS ; + typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , SharedAllocDestroy > RecordFull ; + + static_assert( sizeof(Tracker) == sizeof(int*), "SharedAllocationTracker has wrong size!" 
); + + MemorySpace s ; + + const size_t N = 1200 ; + const size_t size = 8 ; + + RecordMemS * rarray[ N ]; + Header * harray[ N ]; + + RecordMemS ** const r = rarray ; + Header ** const h = harray ; + + Kokkos::RangePolicy< ExecutionSpace > range(0,N); + + //---------------------------------------- + { + Kokkos::parallel_for( range , [=]( size_t i ){ + char name[64] ; + sprintf(name,"test_%.2d",int(i)); + + r[i] = RecordMemS::allocate( s , name , size * ( i + 1 ) ); + h[i] = Header::get_header( r[i]->data() ); + + ASSERT_EQ( r[i]->use_count() , 0 ); + + for ( size_t j = 0 ; j < ( i / 10 ) + 1 ; ++j ) RecordBase::increment( r[i] ); + + ASSERT_EQ( r[i]->use_count() , ( i / 10 ) + 1 ); + ASSERT_EQ( r[i] , RecordMemS::get_record( r[i]->data() ) ); + }); + + // Sanity check for the whole set of allocation records to which this record belongs. + RecordBase::is_sane( r[0] ); + // RecordMemS::print_records( std::cout , s , true ); + + Kokkos::parallel_for( range , [=]( size_t i ){ + while ( 0 != ( r[i] = static_cast< RecordMemS *>( RecordBase::decrement( r[i] ) ) ) ) { + if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] ); + } + }); + } + //---------------------------------------- + { + int destroy_count = 0 ; + SharedAllocDestroy counter( & destroy_count ); + + Kokkos::parallel_for( range , [=]( size_t i ){ + char name[64] ; + sprintf(name,"test_%.2d",int(i)); + + RecordFull * rec = RecordFull::allocate( s , name , size * ( i + 1 ) ); + + rec->m_destroy = counter ; + + r[i] = rec ; + h[i] = Header::get_header( r[i]->data() ); + + ASSERT_EQ( r[i]->use_count() , 0 ); + + for ( size_t j = 0 ; j < ( i / 10 ) + 1 ; ++j ) RecordBase::increment( r[i] ); + + ASSERT_EQ( r[i]->use_count() , ( i / 10 ) + 1 ); + ASSERT_EQ( r[i] , RecordMemS::get_record( r[i]->data() ) ); + }); + + RecordBase::is_sane( r[0] ); + + Kokkos::parallel_for( range , [=]( size_t i ){ + while ( 0 != ( r[i] = static_cast< RecordMemS *>( RecordBase::decrement( r[i] ) ) ) ) { + if ( r[i]->use_count() == 1 
) RecordBase::is_sane( r[i] ); + } + }); + + ASSERT_EQ( destroy_count , int(N) ); + } + + //---------------------------------------- + { + int destroy_count = 0 ; + + { + RecordFull * rec = RecordFull::allocate( s , "test" , size ); + + // ... Construction of the allocated { rec->data() , rec->size() } + + // Copy destruction function object into the allocation record + rec->m_destroy = SharedAllocDestroy( & destroy_count ); + + // Start tracking, increments the use count from 0 to 1 + Tracker track( rec ); + + ASSERT_EQ( rec->use_count() , 1 ); + + // Verify construction / destruction increment + for ( size_t i = 0 ; i < N ; ++i ) { + ASSERT_EQ( rec->use_count() , 1 ); + { + Tracker local_tracker( rec ); + ASSERT_EQ( rec->use_count() , 2 ); + } + ASSERT_EQ( rec->use_count() , 1 ); + } + + Kokkos::parallel_for( range , [=]( size_t i ){ + Tracker local_tracker( rec ); + ASSERT_GT( rec->use_count() , 1 ); + }); + + ASSERT_EQ( rec->use_count() , 1 ); + + // Destruction of 'track' object deallocates the 'rec' and invokes the destroy function object. + } + + ASSERT_EQ( destroy_count , 1 ); + } + +#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */ + +} + + +} + diff --git a/lib/kokkos/core/unit_test/TestTaskPolicy.hpp b/lib/kokkos/core/unit_test/TestTaskPolicy.hpp new file mode 100755 index 0000000000000000000000000000000000000000..96a5ca3b01208e485a887aed9a7dce8d547f31fb --- /dev/null +++ b/lib/kokkos/core/unit_test/TestTaskPolicy.hpp @@ -0,0 +1,494 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#ifndef KOKKOS_UNITTEST_TASKPOLICY_HPP +#define KOKKOS_UNITTEST_TASKPOLICY_HPP + +#include <stdio.h> +#include <iostream> +#include <cmath> +#include <Kokkos_TaskPolicy.hpp> + +namespace TestTaskPolicy { + +//---------------------------------------------------------------------------- + +template< class ExecSpace > +struct FibChild { + + typedef long value_type ; + + Kokkos::Experimental::TaskPolicy<ExecSpace> policy ; + const value_type n ; + int has_nested ; + + FibChild( const Kokkos::Experimental::TaskPolicy<ExecSpace> & arg_policy + , const value_type arg_n ) + : policy(arg_policy,2) /* default dependence capacity = 2 */ + , n( arg_n ), has_nested(0) {} + + inline + void apply( value_type & result ) + { + if ( n < 2 ) { + + has_nested = -1 ; + + result = n ; + } + else { + if ( has_nested == 0 ) { + // Spawn new children and respawn myself to sum their results: + has_nested = 2 ; + + Kokkos::Experimental::respawn + ( policy + , this + , Kokkos::Experimental::spawn( policy , FibChild(policy,n-1) ) + , Kokkos::Experimental::spawn( policy , FibChild(policy,n-2) ) + ); + + } + else if ( has_nested == 2 ) { + + has_nested = -1 ; + + const Kokkos::Experimental::Future<long,ExecSpace> fib_1 = policy.get_dependence(this,0); + const Kokkos::Experimental::Future<long,ExecSpace> fib_2 = policy.get_dependence(this,1); + + result = fib_1.get() + fib_2.get(); + } + else { + fprintf(stderr,"FibChild(%ld) execution error\n",(long)n); + fflush(stderr); + } + } + } +}; + +template< class ExecSpace > +struct FibChild2 { + + typedef long value_type ; + + Kokkos::Experimental::TaskPolicy<ExecSpace> policy ; + const value_type n ; + int has_nested ; + + FibChild2( const Kokkos::Experimental::TaskPolicy<ExecSpace> & arg_policy + , const value_type arg_n ) + : policy(arg_policy,2) /* default dependence capacity = 2 */ + , n( arg_n ), has_nested(0) 
{} + + inline + void apply( value_type & result ) + { + if ( 0 == has_nested ) { + if ( n < 2 ) { + + has_nested = -1 ; + + result = n ; + } + else if ( n < 4 ) { + // Spawn new children and respawn myself to sum their results: + // result = Fib(n-1) + Fib(n-2) + has_nested = 2 ; + // Kokkos::respawn implements the following steps: + policy.clear_dependence( this ); + policy.add_dependence( this , Kokkos::Experimental::spawn( policy , FibChild2(policy,n-1) ) ); + policy.add_dependence( this , Kokkos::Experimental::spawn( policy , FibChild2(policy,n-2) ) ); + policy.respawn( this ); + } + else { + // Spawn new children and respawn myself to sum their results: + // result = Fib(n-1) + Fib(n-2) + // result = ( Fib(n-2) + Fib(n-3) ) + ( Fib(n-3) + Fib(n-4) ) + // result = ( ( Fib(n-3) + Fib(n-4) ) + Fib(n-3) ) + ( Fib(n-3) + Fib(n-4) ) + // result = 3 * Fib(n-3) + 2 * Fib(n-4) + has_nested = 4 ; + // Kokkos::Experimental::respawn implements the following steps: + policy.clear_dependence( this ); + policy.add_dependence( this , Kokkos::Experimental::spawn( policy , FibChild2(policy,n-3) ) ); + policy.add_dependence( this , Kokkos::Experimental::spawn( policy , FibChild2(policy,n-4) ) ); + policy.respawn( this ); + } + } + else if ( 2 == has_nested || 4 == has_nested ) { + const Kokkos::Experimental::Future<long,ExecSpace> fib_a = policy.get_dependence(this,0); + const Kokkos::Experimental::Future<long,ExecSpace> fib_b = policy.get_dependence(this,1); + + result = ( has_nested == 2 ) ? 
fib_a.get() + fib_b.get() + : 3 * fib_a.get() + 2 * fib_b.get() ; + + has_nested = -1 ; + } + else { + fprintf(stderr,"FibChild2(%ld) execution error\n",(long)n); + fflush(stderr); + } + } +}; + +namespace { + +long eval_fib( long n ) +{ + if ( n < 2 ) return n ; + + std::vector<long> fib(n+1); + + fib[0] = 0 ; + fib[1] = 1 ; + + for ( long i = 2 ; i <= n ; ++i ) { fib[i] = fib[i-2] + fib[i-1]; } + + return fib[n]; +} + +} + +template< class ExecSpace > +void test_fib( long n ) +{ + Kokkos::Experimental::TaskPolicy<ExecSpace> policy(2); + + Kokkos::Experimental::Future<long,ExecSpace> f = Kokkos::Experimental::spawn( policy , FibChild<ExecSpace>(policy,n) ); + + Kokkos::Experimental::wait( policy ); + + if ( f.get() != eval_fib(n) ) { + std::cout << "Fib(" << n << ") = " << f.get(); + std::cout << " != " << eval_fib(n); + std::cout << std::endl ; + } +} + +template< class ExecSpace > +void test_fib2( long n ) +{ + Kokkos::Experimental::TaskPolicy<ExecSpace> policy(2); // default dependence capacity + + Kokkos::Experimental::Future<long,ExecSpace> f = Kokkos::Experimental::spawn( policy , FibChild2<ExecSpace>(policy,n) ); + + Kokkos::Experimental::wait( policy ); + + if ( f.get() != eval_fib(n) ) { + std::cout << "Fib2(" << n << ") = " << f.get(); + std::cout << " != " << eval_fib(n); + std::cout << std::endl ; + } +} + +//---------------------------------------------------------------------------- + +template< class ExecSpace > +struct Norm2 { + + typedef double value_type ; + + const double * const m_x ; + + Norm2( const double * x ) : m_x(x) {} + + inline + void init( double & val ) const { val = 0 ; } + + inline + void operator()( int i , double & val ) const { val += m_x[i] * m_x[i] ; } + + void apply( double & dst ) const { dst = std::sqrt( dst ); } +}; + +template< class ExecSpace > +void test_norm2( const int n ) +{ + Kokkos::Experimental::TaskPolicy< ExecSpace > policy ; + + double * const x = new double[n]; + + for ( int i = 0 ; i < n ; ++i ) x[i] = 1 ; + 
+ Kokkos::RangePolicy<ExecSpace> r(0,n); + + Kokkos::Experimental::Future<double,ExecSpace> f = Kokkos::Experimental::spawn_reduce( policy , r , Norm2<ExecSpace>(x) ); + + Kokkos::Experimental::wait( policy ); + +#if defined(PRINT) + std::cout << "Norm2: " << f.get() << std::endl ; +#endif + + delete[] x ; +} + +//---------------------------------------------------------------------------- + +template< class Space > +struct TaskDep { + + typedef int value_type ; + typedef Kokkos::Experimental::TaskPolicy< Space > policy_type ; + + const policy_type policy ; + const int input ; + + TaskDep( const policy_type & arg_p , const int arg_i ) + : policy( arg_p ), input( arg_i ) {} + + void apply( int & val ) + { + val = input ; + const int num = policy.get_dependence( this ); + + for ( int i = 0 ; i < num ; ++i ) { + Kokkos::Experimental::Future<int,Space> f = policy.get_dependence( this , i ); + val += f.get(); + } + } +}; + + +template< class Space > +void test_task_dep( const int n ) +{ + enum { NTEST = 64 }; + + Kokkos::Experimental::TaskPolicy< Space > policy ; + + Kokkos::Experimental::Future<int,Space> f[ NTEST ]; + + for ( int i = 0 ; i < NTEST ; ++i ) { + // Create task in the "constructing" state with capacity for 'n+1' dependences + f[i] = policy.create( TaskDep<Space>(policy,0) , n + 1 ); + + if ( f[i].get_task_state() != Kokkos::Experimental::TASK_STATE_CONSTRUCTING ) { + Kokkos::Impl::throw_runtime_exception("get_task_state() != Kokkos::Experimental::TASK_STATE_CONSTRUCTING"); + } + + // Only use 'n' dependences + + for ( int j = 0 ; j < n ; ++j ) { + + Kokkos::Experimental::Future<int,Space> nested = policy.create( TaskDep<Space>(policy,j+1) ); + + policy.spawn( nested ); + + // Add dependence to a "constructing" task + policy.add_dependence( f[i] , nested ); + } + + // Spawn task from the "constructing" to the "waiting" state + policy.spawn( f[i] ); + } + + const int answer = n % 2 ? 
n * ( ( n + 1 ) / 2 ) : ( n / 2 ) * ( n + 1 ); + + Kokkos::Experimental::wait( policy ); + + int error = 0 ; + for ( int i = 0 ; i < NTEST ; ++i ) { + if ( f[i].get_task_state() != Kokkos::Experimental::TASK_STATE_COMPLETE ) { + Kokkos::Impl::throw_runtime_exception("get_task_state() != Kokkos::Experimental::TASK_STATE_COMPLETE"); + } + if ( answer != f[i].get() && 0 == error ) { + std::cout << "test_task_dep(" << n << ") ERROR at[" << i << "]" + << " answer(" << answer << ") != result(" << f[i].get() << ")" << std::endl ; + } + } +} + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_HAVE_CXX11 ) + +template< class ExecSpace > +struct TaskTeam { + + enum { SPAN = 8 }; + + typedef void value_type ; + typedef Kokkos::Experimental::TaskPolicy<ExecSpace> policy_type ; + typedef Kokkos::Experimental::Future<ExecSpace> future_type ; + typedef Kokkos::View<long*,ExecSpace> view_type ; + + policy_type policy ; + future_type future ; + + view_type result ; + const long nvalue ; + + TaskTeam( const policy_type & arg_policy + , const view_type & arg_result + , const long arg_nvalue ) + : policy(arg_policy) + , future() + , result( arg_result ) + , nvalue( arg_nvalue ) + {} + + inline + void apply( const typename policy_type::member_type & member ) + { + const long end = nvalue + 1 ; + const long begin = 0 < end - SPAN ? 
end - SPAN : 0 ; + + if ( 0 < begin && future.get_task_state() == Kokkos::Experimental::TASK_STATE_NULL ) { + if ( member.team_rank() == 0 ) { + future = policy.spawn( policy.create_team( TaskTeam( policy , result , begin - 1 ) ) ); + policy.clear_dependence( this ); + policy.add_dependence( this , future ); + policy.respawn( this ); + } + return ; + } + + Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end) + , [&]( int i ) { result[i] = i + 1 ; } + ); + } +}; + +template< class ExecSpace > +struct TaskTeamValue { + + enum { SPAN = 8 }; + + typedef long value_type ; + typedef Kokkos::Experimental::TaskPolicy<ExecSpace> policy_type ; + typedef Kokkos::Experimental::Future<value_type,ExecSpace> future_type ; + typedef Kokkos::View<long*,ExecSpace> view_type ; + + policy_type policy ; + future_type future ; + + view_type result ; + const long nvalue ; + + TaskTeamValue( const policy_type & arg_policy + , const view_type & arg_result + , const long arg_nvalue ) + : policy(arg_policy) + , future() + , result( arg_result ) + , nvalue( arg_nvalue ) + {} + + inline + void apply( const typename policy_type::member_type & member , value_type & final ) + { + const long end = nvalue + 1 ; + const long begin = 0 < end - SPAN ? 
end - SPAN : 0 ; + + if ( 0 < begin && future.get_task_state() == Kokkos::Experimental::TASK_STATE_NULL ) { + if ( member.team_rank() == 0 ) { + future = policy.spawn( policy.create_team( TaskTeamValue( policy , result , begin - 1 ) ) ); + policy.clear_dependence( this ); + policy.add_dependence( this , future ); + policy.respawn( this ); + } + return ; + } + + Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end) + , [&]( int i ) { result[i] = i + 1 ; } + ); + + if ( member.team_rank() == 0 ) { + final = result[nvalue] ; + } + + Kokkos::memory_fence(); + } +}; + +template< class ExecSpace > +void test_task_team( long n ) +{ + typedef TaskTeam< ExecSpace > task_type ; + typedef TaskTeamValue< ExecSpace > task_value_type ; + typedef typename task_type::view_type view_type ; + typedef typename task_type::policy_type policy_type ; + + typedef typename task_type::future_type future_type ; + typedef typename task_value_type::future_type future_value_type ; + + policy_type policy ; + view_type result("result",n+1); + + future_type f = policy.spawn( policy.create_team( task_type( policy , result , n ) ) ); + + Kokkos::Experimental::wait( policy ); + + for ( long i = 0 ; i <= n ; ++i ) { + const long answer = i + 1 ; + if ( result(i) != answer ) { + std::cerr << "test_task_team void ERROR result(" << i << ") = " << result(i) << " != " << answer << std::endl ; + } + } + + future_value_type fv = policy.spawn( policy.create_team( task_value_type( policy , result , n ) ) ); + + Kokkos::Experimental::wait( policy ); + + if ( fv.get() != n + 1 ) { + std::cerr << "test_task_team value ERROR future = " << fv.get() << " != " << n + 1 << std::endl ; + } + for ( long i = 0 ; i <= n ; ++i ) { + const long answer = i + 1 ; + if ( result(i) != answer ) { + std::cerr << "test_task_team value ERROR result(" << i << ") = " << result(i) << " != " << answer << std::endl ; + } + } +} + +#endif + +//---------------------------------------------------------------------------- + +} // 
namespace TestTaskPolicy + +#endif /* #ifndef KOKKOS_UNITTEST_TASKPOLICY_HPP */ + + diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp new file mode 100755 index 0000000000000000000000000000000000000000..4849f18dfbac209252d5d2ddde8e0d8dfc98ac7d --- /dev/null +++ b/lib/kokkos/core/unit_test/TestTeam.hpp @@ -0,0 +1,466 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdio.h> +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Test { +namespace { + +template< class ExecSpace > +struct TestTeamPolicy { + + typedef typename Kokkos::TeamPolicy< ExecSpace >::member_type team_member ; + typedef Kokkos::View<int**,ExecSpace> view_type ; + + view_type m_flags ; + + TestTeamPolicy( const size_t league_size ) + : m_flags( Kokkos::ViewAllocateWithoutInitializing("flags") + , Kokkos::TeamPolicy< ExecSpace >::team_size_max( *this ) + , league_size ) + {} + + struct VerifyInitTag {}; + + KOKKOS_INLINE_FUNCTION + void operator()( const team_member & member ) const + { + const int tid = member.team_rank() + member.team_size() * member.league_rank(); + + m_flags( member.team_rank() , member.league_rank() ) = tid ; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const VerifyInitTag & , const team_member & member ) const + { + const int tid = member.team_rank() + member.team_size() * member.league_rank(); + + if ( tid != m_flags( member.team_rank() , member.league_rank() ) ) { + printf("TestTeamPolicy member(%d,%d) error %d != %d\n" + , member.league_rank() , member.team_rank() + , tid , m_flags( 
member.team_rank() , member.league_rank() ) ); + } + } + + static void test_for( const size_t league_size ) + { + TestTeamPolicy functor( league_size ); + + const int team_size = Kokkos::TeamPolicy< ExecSpace >::team_size_max( functor ); + + Kokkos::parallel_for( Kokkos::TeamPolicy< ExecSpace >( league_size , team_size ) , functor ); + Kokkos::parallel_for( Kokkos::TeamPolicy< ExecSpace , VerifyInitTag >( league_size , team_size ) , functor ); + } + + struct ReduceTag {}; + + typedef long value_type ; + + KOKKOS_INLINE_FUNCTION + void operator()( const team_member & member , value_type & update ) const + { + update += member.team_rank() + member.team_size() * member.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const ReduceTag & , const team_member & member , value_type & update ) const + { + update += 1 + member.team_rank() + member.team_size() * member.league_rank(); + } + + static void test_reduce( const size_t league_size ) + { + TestTeamPolicy functor( league_size ); + + const int team_size = Kokkos::TeamPolicy< ExecSpace >::team_size_max( functor ); + const long N = team_size * league_size ; + + long total = 0 ; + + Kokkos::parallel_reduce( Kokkos::TeamPolicy< ExecSpace >( league_size , team_size ) , functor , total ); + ASSERT_EQ( size_t((N-1)*(N))/2 , size_t(total) ); + + Kokkos::parallel_reduce( Kokkos::TeamPolicy< ExecSpace , ReduceTag >( league_size , team_size ) , functor , total ); + ASSERT_EQ( (size_t(N)*size_t(N+1))/2 , size_t(total) ); + } +}; + +} +} + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template< typename ScalarType , class DeviceType > +class ReduceTeamFunctor +{ +public: + typedef DeviceType execution_space ; + typedef Kokkos::TeamPolicy< execution_space > policy_type ; + typedef typename execution_space::size_type size_type ; + + struct value_type { + ScalarType value[3] ; + }; + + const size_type nwork ; + + ReduceTeamFunctor( const size_type & arg_nwork ) : 
nwork( arg_nwork ) {} + + ReduceTeamFunctor( const ReduceTeamFunctor & rhs ) + : nwork( rhs.nwork ) {} + + KOKKOS_INLINE_FUNCTION + void init( value_type & dst ) const + { + dst.value[0] = 0 ; + dst.value[1] = 0 ; + dst.value[2] = 0 ; + } + + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & dst , + const volatile value_type & src ) const + { + dst.value[0] += src.value[0] ; + dst.value[1] += src.value[1] ; + dst.value[2] += src.value[2] ; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const typename policy_type::member_type ind , value_type & dst ) const + { + const int thread_rank = ind.team_rank() + ind.team_size() * ind.league_rank(); + const int thread_size = ind.team_size() * ind.league_size(); + const int chunk = ( nwork + thread_size - 1 ) / thread_size ; + + size_type iwork = chunk * thread_rank ; + const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork ; + + for ( ; iwork < iwork_end ; ++iwork ) { + dst.value[0] += 1 ; + dst.value[1] += iwork + 1 ; + dst.value[2] += nwork - iwork ; + } + } +}; + +} // namespace Test + +namespace { + +template< typename ScalarType , class DeviceType > +class TestReduceTeam +{ +public: + typedef DeviceType execution_space ; + typedef Kokkos::TeamPolicy< execution_space > policy_type ; + typedef typename execution_space::size_type size_type ; + + //------------------------------------ + + TestReduceTeam( const size_type & nwork ) + { + run_test(nwork); + } + + void run_test( const size_type & nwork ) + { + typedef Test::ReduceTeamFunctor< ScalarType , execution_space > functor_type ; + typedef typename functor_type::value_type value_type ; + typedef Kokkos::View< value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type ; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + value_type result[ Repeat ]; + + const unsigned long nw = nwork ; + const unsigned long nsum = nw % 2 ? 
nw * (( nw + 1 )/2 ) + : (nw/2) * ( nw + 1 ); + + const unsigned team_size = policy_type::team_size_recommended( functor_type(nwork) ); + const unsigned league_size = ( nwork + team_size - 1 ) / team_size ; + + policy_type team_exec( league_size , team_size ); + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + result_type tmp( & result[i] ); + Kokkos::parallel_reduce( team_exec , functor_type(nwork) , tmp ); + } + + execution_space::fence(); + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + for ( unsigned j = 0 ; j < Count ; ++j ) { + const unsigned long correct = 0 == j % 3 ? nw : nsum ; + ASSERT_EQ( (ScalarType) correct , result[i].value[j] ); + } + } + } +}; + +} + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template< class DeviceType > +class ScanTeamFunctor +{ +public: + typedef DeviceType execution_space ; + typedef Kokkos::TeamPolicy< execution_space > policy_type ; + + typedef long int value_type ; + Kokkos::View< value_type , execution_space > accum ; + Kokkos::View< value_type , execution_space > total ; + + ScanTeamFunctor() : accum("accum"), total("total") {} + + KOKKOS_INLINE_FUNCTION + void init( value_type & error ) const { error = 0 ; } + + KOKKOS_INLINE_FUNCTION + void join( value_type volatile & error , + value_type volatile const & input ) const + { if ( input ) error = 1 ; } + + struct JoinMax { + typedef long int value_type ; + KOKKOS_INLINE_FUNCTION + void join( value_type volatile & dst + , value_type volatile const & input ) const + { if ( dst < input ) dst = input ; } + }; + + KOKKOS_INLINE_FUNCTION + void operator()( const typename policy_type::member_type ind , value_type & error ) const + { + if ( 0 == ind.league_rank() && 0 == ind.team_rank() ) { + const long int thread_count = ind.league_size() * ind.team_size(); + total() = ( thread_count * ( thread_count + 1 ) ) / 2 ; + } + + // Team max: + const int long m = ind.team_reduce( (long int) ( ind.league_rank() + ind.team_rank() ) 
, JoinMax() ); + + if ( m != ind.league_rank() + ( ind.team_size() - 1 ) ) { + printf("ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n" + , ind.league_rank(), ind.team_rank() + , ind.league_size(), ind.team_size() + , (long int)(ind.league_rank() + ( ind.team_size() - 1 )) , m ); + } + + // Scan: + const long int answer = + ( ind.league_rank() + 1 ) * ind.team_rank() + + ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2 ; + + const long int result = + ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 ); + + const long int result2 = + ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 ); + + if ( answer != result || answer != result2 ) { + printf("ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n", + ind.league_rank(), ind.team_rank(), + ind.league_size(), ind.team_size(), + answer,result,result2); + error = 1 ; + } + + const long int thread_rank = ind.team_rank() + + ind.team_size() * ind.league_rank(); + ind.team_scan( 1 + thread_rank , accum.ptr_on_device() ); + } +}; + +template< class DeviceType > +class TestScanTeam +{ +public: + typedef DeviceType execution_space ; + typedef long int value_type ; + + typedef Kokkos::TeamPolicy< execution_space > policy_type ; + typedef Test::ScanTeamFunctor<DeviceType> functor_type ; + + //------------------------------------ + + TestScanTeam( const size_t nteam ) + { + run_test(nteam); + } + + void run_test( const size_t nteam ) + { + typedef Kokkos::View< long int , Kokkos::HostSpace , Kokkos::MemoryUnmanaged > result_type ; + + const unsigned REPEAT = 100000 ; + const unsigned Repeat = ( REPEAT + nteam - 1 ) / nteam ; + + functor_type functor ; + + policy_type team_exec( nteam , policy_type::team_size_max( functor ) ); + + for ( unsigned i = 0 ; i < Repeat ; ++i ) { + long int accum = 0 ; + long int total = 0 ; + long int error = 0 ; + Kokkos::deep_copy( functor.accum , total ); + Kokkos::parallel_reduce( team_exec , functor , result_type( & 
error ) ); + DeviceType::fence(); + Kokkos::deep_copy( accum , functor.accum ); + Kokkos::deep_copy( total , functor.total ); + + ASSERT_EQ( error , 0 ); + ASSERT_EQ( total , accum ); + } + + execution_space::fence(); + } +}; + +} // namespace Test + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template< class ExecSpace > +struct SharedTeamFunctor { + + typedef ExecSpace execution_space ; + typedef int value_type ; + typedef Kokkos::TeamPolicy< execution_space > policy_type ; + + enum { SHARED_COUNT = 1000 }; + + typedef typename ExecSpace::scratch_memory_space shmem_space ; + + // tbd: MemoryUnmanaged should be the default for shared memory space + typedef Kokkos::View<int*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ; + + // Tell how much shared memory will be required by this functor: + inline + unsigned team_shmem_size( int /* team_size */ ) const + { + return shared_int_array_type::shmem_size( SHARED_COUNT ) + + shared_int_array_type::shmem_size( SHARED_COUNT ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const typename policy_type::member_type & ind , value_type & update ) const + { + const shared_int_array_type shared_A( ind.team_shmem() , SHARED_COUNT ); + const shared_int_array_type shared_B( ind.team_shmem() , SHARED_COUNT ); + + if ((shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0) || + (shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0)) { + printf ("Failed to allocate shared memory of size %lu\n", + static_cast<unsigned long> (SHARED_COUNT)); + ++update; // failure to allocate is an error + } + else { + for ( int i = ind.team_rank() ; i < SHARED_COUNT ; i += ind.team_size() ) { + shared_A[i] = i + ind.league_rank(); + shared_B[i] = 2 * i + ind.league_rank(); + } + + ind.team_barrier(); + + if ( ind.team_rank() + 1 == ind.team_size() ) { + for ( int i = 0 ; i < SHARED_COUNT ; ++i ) { + if ( shared_A[i] != i + ind.league_rank() ) { + ++update ; + } + if ( shared_B[i] 
!= 2 * i + ind.league_rank() ) { + ++update ; + } + } + } + } + } +}; + +} + +namespace { + +template< class ExecSpace > +struct TestSharedTeam { + + TestSharedTeam() + { run(); } + + void run() + { + typedef Test::SharedTeamFunctor<ExecSpace> Functor ; + typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged > result_type ; + + const size_t team_size = Kokkos::TeamPolicy< ExecSpace >::team_size_max( Functor() ); + + Kokkos::TeamPolicy< ExecSpace > team_exec( 8192 / team_size , team_size ); + + typename Functor::value_type error_count = 0 ; + + Kokkos::parallel_reduce( team_exec , Functor() , result_type( & error_count ) ); + + ASSERT_EQ( error_count , 0 ); + } +}; + +} + +/*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp new file mode 100755 index 0000000000000000000000000000000000000000..add8b7ed4578a40b964f688f3ef02d93fb1a1cc5 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp @@ -0,0 +1,650 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#include <impl/Kokkos_Timer.hpp> +#include <iostream> +#include <cstdlib> + +namespace TestTeamVector { + +struct my_complex { + double re,im; + int dummy; + KOKKOS_INLINE_FUNCTION + my_complex() { + re = 0.0; + im = 0.0; + dummy = 0; + } + KOKKOS_INLINE_FUNCTION + my_complex(const my_complex& src) { + re = src.re; + im = src.im; + dummy = src.dummy; + } + + KOKKOS_INLINE_FUNCTION + my_complex(const volatile my_complex& src) { + re = src.re; + im = src.im; + dummy = src.dummy; + } + + KOKKOS_INLINE_FUNCTION + my_complex(const double& val) { + re = val; + im = 0.0; + dummy = 0; + } + KOKKOS_INLINE_FUNCTION + my_complex& operator += (const my_complex& src) { + re += src.re; + im += src.im; + dummy += src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + void operator += (const volatile my_complex& src) volatile { + re += src.re; + im += src.im; + dummy += src.dummy; + } + KOKKOS_INLINE_FUNCTION + my_complex& operator *= (const my_complex& src) { + double re_tmp = re*src.re - im*src.im; + double im_tmp = re * src.im + im * src.re; + re = re_tmp; + im = im_tmp; + dummy *= src.dummy; + return *this; + } + KOKKOS_INLINE_FUNCTION + void operator *= (const volatile my_complex& src) volatile { + double re_tmp = re*src.re - im*src.im; + double im_tmp = re * src.im + im * src.re; + re = re_tmp; + im = im_tmp; + dummy *= src.dummy; + } + KOKKOS_INLINE_FUNCTION + bool operator == (const my_complex& src) { + return (re == src.re) && (im == src.im) && ( dummy == src.dummy ); + } + KOKKOS_INLINE_FUNCTION + bool operator != (const my_complex& src) { + return (re != src.re) || (im != src.im) || ( dummy != src.dummy ); + } + KOKKOS_INLINE_FUNCTION + bool operator != (const double& val) { + return (re != val) || + (im != 0) || (dummy != 0); + } + KOKKOS_INLINE_FUNCTION + my_complex& operator= (const int& val) { + re 
= val; + im = 0.0; + dummy = 0; + return *this; + } + KOKKOS_INLINE_FUNCTION + my_complex& operator= (const double& val) { + re = val; + im = 0.0; + dummy = 0; + return *this; + } + KOKKOS_INLINE_FUNCTION + operator double() { + return re; + } +}; + +#if defined (KOKKOS_HAVE_CXX11) + + +template<typename Scalar, class ExecutionSpace> +struct functor_team_for { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_team_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + + typedef typename ExecutionSpace::scratch_memory_space shmem_space ; + typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int; + typedef typename shared_int::size_type size_type; + + const size_type shmemSize = team.team_size () * 13; + shared_int values = shared_int (team.team_shmem (), shmemSize); + + if (values.ptr_on_device () == NULL || values.dimension_0 () < shmemSize) { + printf ("FAILED to allocate shared memory of size %u\n", + static_cast<unsigned int> (shmemSize)); + } + else { + + // Initialize shared memory + values(team.team_rank ()) = 0; + + // Accumulate value into per thread shared memory + // This is non blocking + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,131),[&] (int i) { + values(team.team_rank ()) += i - team.league_rank () + team.league_size () + team.team_size (); + }); + // Wait for all memory to be written + team.team_barrier (); + // One thread per team executes the comparison + Kokkos::single(Kokkos::PerTeam(team),[&]() { + Scalar test = 0; + Scalar value = 0; + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank () + team.league_size () + team.team_size (); + } + for (int i = 0; i < team.team_size (); 
++i) { + value += values(i); + } + if (test != value) { + printf ("FAILED team_parallel_for %i %i %f %f\n", + team.league_rank (), team.team_rank (), + static_cast<double> (test), static_cast<double> (value)); + flag() = 1; + } + }); + } + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_team_reduce { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_team_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + + Scalar value = Scalar(); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),[&] (int i, Scalar& val) { + val += i - team.league_rank () + team.league_size () + team.team_size (); + },value); + + team.team_barrier (); + Kokkos::single(Kokkos::PerTeam(team),[&]() { + Scalar test = 0; + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank () + team.league_size () + team.team_size (); + } + if (test != value) { + if(team.league_rank() == 0) + printf ("FAILED team_parallel_reduce %i %i %f %f %lu\n", + team.league_rank (), team.team_rank (), + static_cast<double> (test), static_cast<double> (value),sizeof(Scalar)); + flag() = 1; + } + }); + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_team_reduce_join { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_team_reduce_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + + Scalar value = 0; + + 
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131) + , [&] (int i, Scalar& val) { + val += i - team.league_rank () + team.league_size () + team.team_size (); + } + , [&] (volatile Scalar& val, const volatile Scalar& src) {val+=src;} + , value + ); + + team.team_barrier (); + Kokkos::single(Kokkos::PerTeam(team),[&]() { + Scalar test = 0; + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank () + team.league_size () + team.team_size (); + } + if (test != value) { + printf ("FAILED team_vector_parallel_reduce_join %i %i %f %f\n", + team.league_rank (), team.team_rank (), + static_cast<double> (test), static_cast<double> (value)); + flag() = 1; + } + }); + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_team_vector_for { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_team_vector_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + + typedef typename ExecutionSpace::scratch_memory_space shmem_space ; + typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int; + typedef typename shared_int::size_type size_type; + + const size_type shmemSize = team.team_size () * 13; + shared_int values = shared_int (team.team_shmem (), shmemSize); + + if (values.ptr_on_device () == NULL || values.dimension_0 () < shmemSize) { + printf ("FAILED to allocate shared memory of size %u\n", + static_cast<unsigned int> (shmemSize)); + } + else { + Kokkos::single(Kokkos::PerThread(team),[&] () { + values(team.team_rank ()) = 0; + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,131),[&] (int i) { + Kokkos::single(Kokkos::PerThread(team),[&] () { + values(team.team_rank ()) += i - 
team.league_rank () + team.league_size () + team.team_size (); + }); + }); + + team.team_barrier (); + Kokkos::single(Kokkos::PerTeam(team),[&]() { + Scalar test = 0; + Scalar value = 0; + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank () + team.league_size () + team.team_size (); + } + for (int i = 0; i < team.team_size (); ++i) { + value += values(i); + } + if (test != value) { + printf ("FAILED team_vector_parallel_for %i %i %f %f\n", + team.league_rank (), team.team_rank (), + static_cast<double> (test), static_cast<double> (value)); + flag() = 1; + } + }); + } + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_team_vector_reduce { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_team_vector_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + + Scalar value = Scalar(); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),[&] (int i, Scalar& val) { + val += i - team.league_rank () + team.league_size () + team.team_size (); + },value); + + team.team_barrier (); + Kokkos::single(Kokkos::PerTeam(team),[&]() { + Scalar test = 0; + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank () + team.league_size () + team.team_size (); + } + if (test != value) { + if(team.league_rank() == 0) + printf ("FAILED team_vector_parallel_reduce %i %i %f %f %lu\n", + team.league_rank (), team.team_rank (), + static_cast<double> (test), static_cast<double> (value),sizeof(Scalar)); + flag() = 1; + } + }); + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_team_vector_reduce_join { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace 
execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_team_vector_reduce_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + + Scalar value = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131) + , [&] (int i, Scalar& val) { + val += i - team.league_rank () + team.league_size () + team.team_size (); + } + , [&] (volatile Scalar& val, const volatile Scalar& src) {val+=src;} + , value + ); + + team.team_barrier (); + Kokkos::single(Kokkos::PerTeam(team),[&]() { + Scalar test = 0; + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank () + team.league_size () + team.team_size (); + } + if (test != value) { + printf ("FAILED team_vector_parallel_reduce_join %i %i %f %f\n", + team.league_rank (), team.team_rank (), + static_cast<double> (test), static_cast<double> (value)); + flag() = 1; + } + }); + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_vec_single { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_vec_single(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + + // Warning: this test case intentionally violates permissable semantics + // It is not valid to get references to members of the enclosing region + // inside a parallel_for and write to it. 
+ Scalar value = 0; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,13),[&] (int i) { + value = i; // This write is violating Kokkos semantics for nested parallelism + }); + + Kokkos::single(Kokkos::PerThread(team),[&] (Scalar& val) { + val = 1; + },value); + + Scalar value2 = 0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13), [&] (int i, Scalar& val) { + val += value; + },value2); + + if(value2!=(value*13)) { + printf("FAILED vector_single broadcast %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) value2,(double) value); + flag()=1; + } + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_vec_for { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_vec_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + + typedef typename ExecutionSpace::scratch_memory_space shmem_space ; + typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int; + shared_int values = shared_int(team.team_shmem(),team.team_size()*13); + + if (values.ptr_on_device () == NULL || + values.dimension_0() < (unsigned) team.team_size() * 13) { + printf ("FAILED to allocate memory of size %i\n", + static_cast<int> (team.team_size () * 13)); + flag() = 1; + } + else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,13), [&] (int i) { + values(13*team.team_rank() + i) = i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size(); + }); + + Kokkos::single(Kokkos::PerThread(team),[&] () { + Scalar test = 0; + Scalar value = 0; + for (int i = 0; i < 13; ++i) { + test += i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size(); + value += 
values(13*team.team_rank() + i); + } + if (test != value) { + printf ("FAILED vector_par_for %i %i %f %f\n", + team.league_rank (), team.team_rank (), + static_cast<double> (test), static_cast<double> (value)); + flag() = 1; + } + }); + } + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_vec_red { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_vec_red(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + Scalar value = 0; + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13),[&] (int i, Scalar& val) { + val += i; + }, value); + + Kokkos::single(Kokkos::PerThread(team),[&] () { + Scalar test = 0; + for(int i = 0; i < 13; i++) { + test+=i; + } + if(test!=value) { + printf("FAILED vector_par_reduce %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) value); + flag()=1; + } + }); + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_vec_red_join { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_vec_red_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + Scalar value = 1; + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13) + , [&] (int i, Scalar& val) { val *= i; } + , [&] (Scalar& val, const Scalar& src) {val*=src;} + , value + ); + + Kokkos::single(Kokkos::PerThread(team),[&] () { + Scalar test = 1; + for(int i = 0; i < 13; i++) { + test*=i; + } + if(test!=value) { + printf("FAILED vector_par_reduce_join %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) value); + flag()=1; + } + 
}); + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_vec_scan { + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_vec_scan(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team) const { + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,13),[&] (int i, Scalar& val, bool final) { + val += i; + if(final) { + Scalar test = 0; + for(int k = 0; k <= i; k++) { + test+=k; + } + if(test!=val) { + printf("FAILED vector_par_scan %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) val); + flag()=1; + } + } + }); + } +}; + +template<typename Scalar, class ExecutionSpace> +struct functor_reduce { + typedef double value_type; + typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type; + typedef ExecutionSpace execution_space; + + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag; + functor_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (typename policy_type::member_type team, double& sum) const { + sum += team.league_rank() * 100 + team.thread_rank(); + } +}; +#endif + +template<typename Scalar,class ExecutionSpace> +bool test_scalar(int nteams, int team_size, int test) { + Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> d_flag("flag"); + typename Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace>::HostMirror h_flag("h_flag"); + h_flag() = 0 ; + Kokkos::deep_copy(d_flag,h_flag); + #ifdef KOKKOS_HAVE_CXX11 + if(test==0) + Kokkos::parallel_for( std::string("A") , Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8), + functor_vec_red<Scalar, ExecutionSpace>(d_flag)); + if(test==1) + Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8), + functor_vec_red_join<Scalar, ExecutionSpace>(d_flag)); + 
if(test==2) + Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8), + functor_vec_scan<Scalar, ExecutionSpace>(d_flag)); + if(test==3) + Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8), + functor_vec_for<Scalar, ExecutionSpace>(d_flag)); + if(test==4) + Kokkos::parallel_for( "B" , Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8), + functor_vec_single<Scalar, ExecutionSpace>(d_flag)); + if(test==5) + Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size), + functor_team_for<Scalar, ExecutionSpace>(d_flag)); + if(test==6) + Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size), + functor_team_reduce<Scalar, ExecutionSpace>(d_flag)); + if(test==7) + Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size), + functor_team_reduce_join<Scalar, ExecutionSpace>(d_flag)); + if(test==8) + Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8), + functor_team_vector_for<Scalar, ExecutionSpace>(d_flag)); + if(test==9) + Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8), + functor_team_vector_reduce<Scalar, ExecutionSpace>(d_flag)); + if(test==10) + Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8), + functor_team_vector_reduce_join<Scalar, ExecutionSpace>(d_flag)); + #endif + Kokkos::deep_copy(h_flag,d_flag); + + return (h_flag() == 0); +} + +template<class ExecutionSpace> +bool Test(int test) { + bool passed = true; + passed = passed && test_scalar<int, ExecutionSpace>(317,33,test); + passed = passed && test_scalar<long long int, ExecutionSpace>(317,33,test); + passed = passed && test_scalar<float, ExecutionSpace>(317,33,test); + passed = passed && test_scalar<double, ExecutionSpace>(317,33,test); + passed = passed && test_scalar<my_complex, ExecutionSpace>(317,33,test); + return passed; +} + +} + diff --git a/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp 
b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp new file mode 100755 index 0000000000000000000000000000000000000000..4f136bc64b977e3243b9aaf789d4837e7e5ca793 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp @@ -0,0 +1,219 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#define KOKKOS_PRAGMA_UNROLL(a) + +namespace { + +template<class Scalar, class ExecutionSpace> +struct SumPlain { + typedef ExecutionSpace execution_space; + typedef typename Kokkos::View<Scalar*,execution_space> type; + type view; + SumPlain(type view_):view(view_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (int i, Scalar& val) { + val += Scalar(); + } +}; + +template<class Scalar, class ExecutionSpace> +struct SumInitJoinFinalValueType { + typedef ExecutionSpace execution_space; + typedef typename Kokkos::View<Scalar*,execution_space> type; + type view; + typedef Scalar value_type; + SumInitJoinFinalValueType(type view_):view(view_) {} + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val = value_type(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& val, volatile value_type& src) const { + val += src; + } + + KOKKOS_INLINE_FUNCTION + void operator() (int i, value_type& val) const { + val += value_type(); + } + +}; + +template<class Scalar, class ExecutionSpace> +struct SumInitJoinFinalValueType2 { + typedef ExecutionSpace execution_space; + typedef typename Kokkos::View<Scalar*,execution_space> type; + type view; + typedef Scalar value_type; + SumInitJoinFinalValueType2(type view_):view(view_) {} + 
+ KOKKOS_INLINE_FUNCTION + void init(volatile value_type& val) const { + val = value_type(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& val, const volatile value_type& src) const { + val += src; + } + + KOKKOS_INLINE_FUNCTION + void operator() (int i, value_type& val) const { + val += value_type(); + } + +}; + +template<class Scalar, class ExecutionSpace> +struct SumInitJoinFinalValueTypeArray { + typedef ExecutionSpace execution_space; + typedef typename Kokkos::View<Scalar*,execution_space> type; + type view; + typedef Scalar value_type[]; + int n; + SumInitJoinFinalValueTypeArray(type view_, int n_):view(view_),n(n_) {} + + KOKKOS_INLINE_FUNCTION + void init(value_type val) const { + for(int k=0;k<n;k++) + val[k] = 0; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type val, const volatile value_type src) const { + for(int k=0;k<n;k++) + val[k] += src[k]; + } + + KOKKOS_INLINE_FUNCTION + void operator() (int i, value_type val) const { + for(int k=0;k<n;k++) + val[k] += k*i; + } + +}; + +template<class Scalar, class ExecutionSpace> +struct SumWrongInitJoinFinalValueType { + typedef ExecutionSpace execution_space; + typedef typename Kokkos::View<Scalar*,execution_space> type; + type view; + typedef Scalar value_type; + SumWrongInitJoinFinalValueType(type view_):view(view_) {} + + KOKKOS_INLINE_FUNCTION + void init(double& val) const { + val = double(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& val, const value_type& src) const { + val += src; + } + + KOKKOS_INLINE_FUNCTION + void operator() (int i, value_type& val) const { + val += value_type(); + } + +}; + +template<class Scalar, class ExecutionSpace> +void TestTemplateMetaFunctions() { + typedef typename Kokkos::View<Scalar*,ExecutionSpace> type; + type a("A",100); +/* #ifdef KOKKOS_HAVE_CXX11 + int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value; + ASSERT_EQ(sum_plain_has_init_arg,0); + int 
sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value; + ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg,1); + int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value; + ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg2,1); + int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value; + ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_init_arg,0); + + //int sum_initjoinfinalvaluetypearray_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueTypeArray<Scalar,ExecutionSpace>, Scalar[] >::value; + //ASSERT_EQ(sum_initjoinfinalvaluetypearray_has_init_arg,1); + + #else + + int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value; + ASSERT_EQ(sum_plain_has_init_arg,0); + int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value; + ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg,1); + int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value; + ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_init_arg,1); + + #endif + + //printf("Values Init: %i %i %i\n",sum_plain_has_init_arg,sum_initjoinfinalvaluetype_has_init_arg,sum_wronginitjoinfinalvaluetype_has_init_arg); + +#ifdef KOKKOS_HAVE_CXX11 + int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumPlain<Scalar,ExecutionSpace>, Scalar >::value; + ASSERT_EQ(sum_plain_has_join_arg,0); + int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value; + ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg,1); + int sum_initjoinfinalvaluetype_has_join_arg2 = 
Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value; + ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg2,1); + int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value; + ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_join_arg,0); +#else + int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value; + ASSERT_EQ(sum_plain_has_join_arg,0); + int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value; + ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg,1); + int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar& >::value; + ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg2,1); + int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value; + ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_join_arg,1); +#endif*/ + //printf("Values Join: %i %i %i\n",sum_plain_has_join_arg,sum_initjoinfinalvaluetype_has_join_arg,sum_wronginitjoinfinalvaluetype_has_join_arg); +} + +} diff --git a/lib/kokkos/core/unit_test/TestThreads.cpp b/lib/kokkos/core/unit_test/TestThreads.cpp new file mode 100755 index 0000000000000000000000000000000000000000..3832998ab5f04fdf91020691539872a48733b8fd --- /dev/null +++ b/lib/kokkos/core/unit_test/TestThreads.cpp @@ -0,0 +1,443 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Macros.hpp> + +#if defined( KOKKOS_HAVE_PTHREAD ) + +#include <Kokkos_Core.hpp> + +#include <Threads/Kokkos_Threads_TaskPolicy.hpp> + +//---------------------------------------------------------------------------- + +#include <TestSharedAlloc.hpp> +#include <TestViewMapping.hpp> + +#include <TestViewImpl.hpp> + +#include <TestViewAPI.hpp> +#include <TestViewSubview.hpp> +#include <TestAtomic.hpp> + +#include <TestReduce.hpp> +#include <TestScan.hpp> +#include <TestRange.hpp> +#include <TestTeam.hpp> +#include <TestAggregate.hpp> +#include <TestAggregateReduction.hpp> +#include <TestCompilerMacros.hpp> +#include <TestCXX11.hpp> +#include <TestCXX11Deduction.hpp> +#include <TestTeamVector.hpp> +#include <TestMemorySpaceTracking.hpp> +#include <TestTemplateMetaFunctions.hpp> + +#include <TestTaskPolicy.hpp> + +namespace Test { + +class threads : public ::testing::Test { +protected: + static void SetUpTestCase() + { + // Finalize without initialize is a no-op: + Kokkos::Threads::finalize(); + + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); + + unsigned threads_count = 0 ; + + // Initialize and finalize with no threads: + Kokkos::Threads::initialize( 1u ); + Kokkos::Threads::finalize(); + + threads_count = std::max( 1u , numa_count ) + * std::max( 2u , cores_per_numa * threads_per_core ); + + Kokkos::Threads::initialize( threads_count ); + Kokkos::Threads::finalize(); + + + threads_count = std::max( 1u , numa_count * 2 ) + * std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 ); + + Kokkos::Threads::initialize( threads_count ); + Kokkos::Threads::finalize(); + + // Quick attempt to verify thread 
start/terminate don't have race condition: + threads_count = std::max( 1u , numa_count ) + * std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 ); + for ( unsigned i = 0 ; i < 10 ; ++i ) { + Kokkos::Threads::initialize( threads_count ); + Kokkos::Threads::sleep(); + Kokkos::Threads::wake(); + Kokkos::Threads::finalize(); + } + + Kokkos::Threads::initialize( threads_count ); + Kokkos::Threads::print_configuration( std::cout , true /* detailed */ ); + } + + static void TearDownTestCase() + { + Kokkos::Threads::finalize(); + } +}; + +TEST_F( threads , init ) { + ; +} + +TEST_F( threads , impl_shared_alloc ) { + test_shared_alloc< Kokkos::HostSpace , Kokkos::Threads >(); +} + +TEST_F( threads , impl_view_mapping ) { + test_view_mapping< Kokkos::Threads >(); + test_view_mapping_subview< Kokkos::Threads >(); + test_view_mapping_operator< Kokkos::Threads >(); + TestViewMappingAtomic< Kokkos::Threads >::run(); +} + + +TEST_F( threads, view_impl) { + test_view_impl< Kokkos::Threads >(); +} + +TEST_F( threads, view_api) { + TestViewAPI< double , Kokkos::Threads >(); +} + +TEST_F( threads, view_subview_auto_1d_left ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Threads >(); +} + +TEST_F( threads, view_subview_auto_1d_right ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Threads >(); +} + +TEST_F( threads, view_subview_auto_1d_stride ) { + TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Threads >(); +} + +TEST_F( threads, view_subview_assign_strided ) { + TestViewSubview::test_1d_strided_assignment< Kokkos::Threads >(); +} + +TEST_F( threads, view_subview_left_0 ) { + TestViewSubview::test_left_0< Kokkos::Threads >(); +} + +TEST_F( threads, view_subview_left_1 ) { + TestViewSubview::test_left_1< Kokkos::Threads >(); +} + +TEST_F( threads, view_subview_left_2 ) { + TestViewSubview::test_left_2< Kokkos::Threads >(); +} + +TEST_F( threads, view_subview_left_3 ) { + TestViewSubview::test_left_3< Kokkos::Threads >(); +} + +TEST_F( 
threads, view_subview_right_0 ) { + TestViewSubview::test_right_0< Kokkos::Threads >(); +} + +TEST_F( threads, view_subview_right_1 ) { + TestViewSubview::test_right_1< Kokkos::Threads >(); +} + +TEST_F( threads, view_subview_right_3 ) { + TestViewSubview::test_right_3< Kokkos::Threads >(); +} + + +TEST_F( threads, view_aggregate ) { + TestViewAggregate< Kokkos::Threads >(); + TestViewAggregateReduction< Kokkos::Threads >(); +} + +TEST_F( threads , range_tag ) +{ + TestRange< Kokkos::Threads >::test_for(1000); + TestRange< Kokkos::Threads >::test_reduce(1000); + TestRange< Kokkos::Threads >::test_scan(1000); +} + +TEST_F( threads , team_tag ) +{ + TestTeamPolicy< Kokkos::Threads >::test_for(1000); + TestTeamPolicy< Kokkos::Threads >::test_reduce(1000); +} + +TEST_F( threads, long_reduce) { + TestReduce< long , Kokkos::Threads >( 1000000 ); +} + +TEST_F( threads, double_reduce) { + TestReduce< double , Kokkos::Threads >( 1000000 ); +} + +TEST_F( threads, team_long_reduce) { + TestReduceTeam< long , Kokkos::Threads >( 100000 ); +} + +TEST_F( threads, team_double_reduce) { + TestReduceTeam< double , Kokkos::Threads >( 100000 ); +} + +TEST_F( threads, long_reduce_dynamic ) { + TestReduceDynamic< long , Kokkos::Threads >( 1000000 ); +} + +TEST_F( threads, double_reduce_dynamic ) { + TestReduceDynamic< double , Kokkos::Threads >( 1000000 ); +} + +TEST_F( threads, long_reduce_dynamic_view ) { + TestReduceDynamicView< long , Kokkos::Threads >( 1000000 ); +} + +TEST_F( threads, team_shared_request) { + TestSharedTeam< Kokkos::Threads >(); +} + +TEST_F( threads , view_remap ) +{ + enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 }; + + typedef Kokkos::View< double*[N1][N2][N3] , + Kokkos::LayoutRight , + Kokkos::Threads > output_type ; + + typedef Kokkos::View< int**[N2][N3] , + Kokkos::LayoutLeft , + Kokkos::Threads > input_type ; + + typedef Kokkos::View< int*[N0][N2][N3] , + Kokkos::LayoutLeft , + Kokkos::Threads > diff_type ; + + output_type output( "output" , N0 ); + input_type 
input ( "input" , N0 , N1 ); + diff_type diff ( "diff" , N0 ); + + int value = 0 ; + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) { + input(i0,i1,i2,i3) = ++value ; + }}}} + + // Kokkos::deep_copy( diff , input ); // throw with incompatible shape + Kokkos::deep_copy( output , input ); + + value = 0 ; + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) { + ++value ; + ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) ); + }}}} +} + +//---------------------------------------------------------------------------- + +TEST_F( threads , atomics ) +{ + const int loop_count = 1e6 ; + + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,2) ) ); + ASSERT_TRUE( ( 
TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,3) ) ); + + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,3) ) ); + +#if defined( KOKKOS_ENABLE_ASM ) + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,2) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,3) ) ); +#endif + + ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<3>, Kokkos::Threads>(loop_count,1) ) ); + ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<3>, Kokkos::Threads>(loop_count,2) ) ); +} + +//---------------------------------------------------------------------------- + +#if 0 +TEST_F( threads , scan_small ) +{ + typedef TestScan< Kokkos::Threads , Kokkos::Impl::ThreadsExecUseScanSmall > TestScanFunctor ; + for ( int i = 0 ; i < 1000 ; ++i ) { + TestScanFunctor( 10 ); + TestScanFunctor( 10000 ); + } + TestScanFunctor( 1000000 ); + TestScanFunctor( 10000000 ); + + Kokkos::Threads::fence(); +} +#endif + +TEST_F( threads , scan ) +{ + TestScan< Kokkos::Threads >::test_range( 1 , 1000 ); + TestScan< Kokkos::Threads >( 1000000 ); + TestScan< Kokkos::Threads >( 10000000 ); + Kokkos::Threads::fence(); +} + +//---------------------------------------------------------------------------- + +TEST_F( threads , team_scan ) +{ + TestScanTeam< Kokkos::Threads >( 10 ); + TestScanTeam< Kokkos::Threads >( 10000 ); +} + +//---------------------------------------------------------------------------- + +TEST_F( threads , compiler_macros ) +{ + ASSERT_TRUE( ( 
TestCompilerMacros::Test< Kokkos::Threads >() ) ); +} + +TEST_F( threads , memory_space ) +{ + TestMemorySpace< Kokkos::Threads >(); +} + +//---------------------------------------------------------------------------- + +TEST_F( threads , template_meta_functions ) +{ + TestTemplateMetaFunctions<int, Kokkos::Threads >(); +} + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_HAVE_CXX11 ) && defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) +TEST_F( threads , cxx11 ) +{ + if ( Kokkos::Impl::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Threads >::value ) { + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(1) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(2) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(3) ) ); + ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(4) ) ); + } +} +#endif + +#if defined (KOKKOS_HAVE_CXX11) + +TEST_F( threads , reduction_deduction ) +{ + TestCXX11::test_reduction_deduction< Kokkos::Threads >(); +} + +TEST_F( threads , team_vector ) +{ + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(0) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(1) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(2) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(3) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(4) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(5) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(6) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(7) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(8) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(9) ) ); + ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(10) ) ); +} + +#endif + +TEST_F( threads , task_policy ) +{ + TestTaskPolicy::test_task_dep< Kokkos::Threads >( 10 ); + for ( long i = 0 ; i < 25 ; ++i ) TestTaskPolicy::test_fib< Kokkos::Threads >(i); + for ( long i = 0 ; i < 35 ; ++i ) 
TestTaskPolicy::test_fib2< Kokkos::Threads >(i); +} + +#if defined( KOKKOS_HAVE_CXX11 ) +TEST_F( threads , task_team ) +{ + TestTaskPolicy::test_task_team< Kokkos::Threads >(1000); +} +#endif + + +} // namespace Test + +#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */ diff --git a/lib/kokkos/core/unit_test/TestTile.hpp b/lib/kokkos/core/unit_test/TestTile.hpp new file mode 100755 index 0000000000000000000000000000000000000000..dfb2bd81b3dec3485688f9827d3f1f7ad24ddb9d --- /dev/null +++ b/lib/kokkos/core/unit_test/TestTile.hpp @@ -0,0 +1,153 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef TEST_TILE_HPP +#define TEST_TILE_HPP + +#include <Kokkos_Core.hpp> + +namespace TestTile { + +template < typename Device , typename TileLayout> +struct ReduceTileErrors +{ + typedef Device execution_space ; + + typedef Kokkos::View< ptrdiff_t**, TileLayout, Device> array_type; + typedef Kokkos::View< ptrdiff_t[ TileLayout::N0 ][ TileLayout::N1 ], Kokkos::LayoutLeft , Device > tile_type ; + + array_type m_array ; + + typedef ptrdiff_t value_type; + + ReduceTileErrors( array_type a ) + : m_array(a) + {} + + + KOKKOS_INLINE_FUNCTION + static void init( value_type & errors ) + { + errors = 0; + } + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & errors , + const volatile value_type & src_errors ) + { + errors += src_errors; + } + + // Initialize + KOKKOS_INLINE_FUNCTION + void operator()( size_t iwork ) const + { + const size_t i = iwork % m_array.dimension_0(); + const size_t j = iwork / m_array.dimension_0(); + if ( j < m_array.dimension_1() ) { + m_array(i,j) = & m_array(i,j) - & m_array(0,0); + +// printf("m_array(%d,%d) = %d\n",int(i),int(j),int(m_array(i,j))); + + } + } + + // Verify: + KOKKOS_INLINE_FUNCTION + void operator()( size_t iwork , value_type & errors ) const + { + const size_t tile_dim0 = ( m_array.dimension_0() + TileLayout::N0 - 1 
) / TileLayout::N0 ; + const size_t tile_dim1 = ( m_array.dimension_1() + TileLayout::N1 - 1 ) / TileLayout::N1 ; + + const size_t itile = iwork % tile_dim0 ; + const size_t jtile = iwork / tile_dim0 ; + + if ( jtile < tile_dim1 ) { + + tile_type tile = Kokkos::tile_subview( m_array , itile , jtile ); + + if ( tile(0,0) != ptrdiff_t(( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) { + ++errors ; + } + else { + + for ( size_t j = 0 ; j < size_t(TileLayout::N1) ; ++j ) { + for ( size_t i = 0 ; i < size_t(TileLayout::N0) ; ++i ) { + const size_t iglobal = i + itile * TileLayout::N0 ; + const size_t jglobal = j + jtile * TileLayout::N1 ; + + if ( iglobal < m_array.dimension_0() && jglobal < m_array.dimension_1() ) { + if ( tile(i,j) != ptrdiff_t( tile(0,0) + i + j * TileLayout::N0 ) ) ++errors ; + +// printf("tile(%d,%d)(%d,%d) = %d\n",int(itile),int(jtile),int(i),int(j),int(tile(i,j))); + + } + } + } + } + } + } +}; + +template< class Space , unsigned N0 , unsigned N1 > +void test( const size_t dim0 , const size_t dim1 ) +{ + typedef Kokkos::LayoutTileLeft<N0,N1> array_layout ; + typedef ReduceTileErrors< Space , array_layout > functor_type ; + + const size_t tile_dim0 = ( dim0 + N0 - 1 ) / N0 ; + const size_t tile_dim1 = ( dim1 + N1 - 1 ) / N1 ; + + typename functor_type::array_type array("",dim0,dim1); + + Kokkos::parallel_for( Kokkos::RangePolicy<Space,size_t>(0,dim0*dim1) , functor_type( array ) ); + + ptrdiff_t error = 0 ; + + Kokkos::parallel_reduce( Kokkos::RangePolicy<Space,size_t>(0,tile_dim0*tile_dim1) , functor_type( array ) , error ); + + EXPECT_EQ( error , ptrdiff_t(0) ); +} + +} /* namespace TestTile */ + +#endif //TEST_TILE_HPP + diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp new file mode 100755 index 0000000000000000000000000000000000000000..b0a81cec6beefc38233685e506e514c1595dc4ef --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp @@ -0,0 +1,1305 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> + +/*--------------------------------------------------------------------------*/ + +#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + +namespace Test { + +template< typename T, class DeviceType > +class TestViewAPI { +public: + TestViewAPI() {} +}; + +} + +#else + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template< class T , class L , class D , class M , class S > +size_t allocation_count( const Kokkos::View<T,L,D,M,S> & view ) +{ + const size_t card = Kokkos::Impl::cardinality_count( view.shape() ); + const size_t alloc = view.capacity(); + + return card <= alloc ? alloc : 0 ; +} + +/*--------------------------------------------------------------------------*/ + +template< typename T, class DeviceType> +struct TestViewOperator +{ + typedef DeviceType execution_space ; + + static const unsigned N = 100 ; + static const unsigned D = 3 ; + + typedef Kokkos::View< T*[D] , execution_space > view_type ; + + const view_type v1 ; + const view_type v2 ; + + TestViewOperator() + : v1( "v1" , N ) + , v2( "v2" , N ) + {} + + static void testit() + { + Kokkos::parallel_for( N , TestViewOperator() ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const unsigned i ) const + { + const unsigned X = 0 ; + const unsigned Y = 1 ; + const unsigned Z = 2 ; + + v2(i,X) = v1(i,X); + v2(i,Y) = v1(i,Y); + v2(i,Z) = v1(i,Z); + } +}; + +/*--------------------------------------------------------------------------*/ + +template< class DataType > +struct rank { +private: + typedef typename Kokkos::Impl::AnalyzeShape<DataType>::shape shape ; +public: + static const unsigned value = shape::rank ; +}; + +template< class DataType , + class DeviceType , + unsigned Rank = rank< DataType 
>::value > +struct TestViewOperator_LeftAndRight ; + +template< class DataType , class DeviceType > +struct TestViewOperator_LeftAndRight< DataType , DeviceType , 8 > +{ + typedef DeviceType execution_space ; + typedef typename execution_space::memory_space memory_space ; + typedef typename execution_space::size_type size_type ; + + typedef int value_type ; + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + const volatile value_type & input ) + { update |= input ; } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & update ) + { update = 0 ; } + + + typedef Kokkos:: + View< DataType, Kokkos::LayoutLeft, execution_space > left_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutRight, execution_space > right_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutStride, execution_space > stride_view ; + + typedef typename left_view ::shape_type left_shape ; + typedef typename right_view::shape_type right_shape ; + + left_shape lsh ; + right_shape rsh ; + left_view left ; + right_view right ; + stride_view left_stride ; + stride_view right_stride ; + long left_alloc ; + long right_alloc ; + + TestViewOperator_LeftAndRight() + : lsh() + , rsh() + , left( "left" ) + , right( "right" ) + , left_stride( left ) + , right_stride( right ) + , left_alloc( allocation_count( left ) ) + , right_alloc( allocation_count( right ) ) + {} + + static void testit() + { + TestViewOperator_LeftAndRight driver ; + + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc ); + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc ); + + int error_flag = 0 ; + + Kokkos::parallel_reduce( 1 , driver , error_flag ); + + ASSERT_EQ( error_flag , 0 ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type , value_type & update ) const + { + long offset ; + + offset = -1 ; + for ( unsigned i7 = 0 ; i7 < unsigned(lsh.N7) ; ++i7 ) + for ( unsigned i6 = 0 ; i6 < 
unsigned(lsh.N6) ; ++i6 ) + for ( unsigned i5 = 0 ; i5 < unsigned(lsh.N5) ; ++i5 ) + for ( unsigned i4 = 0 ; i4 < unsigned(lsh.N4) ; ++i4 ) + for ( unsigned i3 = 0 ; i3 < unsigned(lsh.N3) ; ++i3 ) + for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 ) + for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 ) + for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 ) + { + const long j = & left( i0, i1, i2, i3, i4, i5, i6, i7 ) - + & left( 0, 0, 0, 0, 0, 0, 0, 0 ); + if ( j <= offset || left_alloc <= j ) { update |= 1 ; } + offset = j ; + + if ( & left(i0,i1,i2,i3,i4,i5,i6,i7) != + & left_stride(i0,i1,i2,i3,i4,i5,i6,i7) ) { + update |= 4 ; + } + } + + offset = -1 ; + for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 ) + for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 ) + for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 ) + for ( unsigned i3 = 0 ; i3 < unsigned(rsh.N3) ; ++i3 ) + for ( unsigned i4 = 0 ; i4 < unsigned(rsh.N4) ; ++i4 ) + for ( unsigned i5 = 0 ; i5 < unsigned(rsh.N5) ; ++i5 ) + for ( unsigned i6 = 0 ; i6 < unsigned(rsh.N6) ; ++i6 ) + for ( unsigned i7 = 0 ; i7 < unsigned(rsh.N7) ; ++i7 ) + { + const long j = & right( i0, i1, i2, i3, i4, i5, i6, i7 ) - + & right( 0, 0, 0, 0, 0, 0, 0, 0 ); + if ( j <= offset || right_alloc <= j ) { update |= 2 ; } + offset = j ; + + if ( & right(i0,i1,i2,i3,i4,i5,i6,i7) != + & right_stride(i0,i1,i2,i3,i4,i5,i6,i7) ) { + update |= 8 ; + } + } + } +}; + +template< class DataType , class DeviceType > +struct TestViewOperator_LeftAndRight< DataType , DeviceType , 7 > +{ + typedef DeviceType execution_space ; + typedef typename execution_space::memory_space memory_space ; + typedef typename execution_space::size_type size_type ; + + typedef int value_type ; + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + const volatile value_type & input ) + { update |= input ; } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & update ) + { update = 0 ; } + + + typedef Kokkos:: + View< 
DataType, Kokkos::LayoutLeft, execution_space > left_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutRight, execution_space > right_view ; + + typedef typename left_view ::shape_type left_shape ; + typedef typename right_view::shape_type right_shape ; + + left_shape lsh ; + right_shape rsh ; + left_view left ; + right_view right ; + long left_alloc ; + long right_alloc ; + + TestViewOperator_LeftAndRight() + : lsh() + , rsh() + , left( "left" ) + , right( "right" ) + , left_alloc( allocation_count( left ) ) + , right_alloc( allocation_count( right ) ) + {} + + static void testit() + { + TestViewOperator_LeftAndRight driver ; + + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc ); + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc ); + + int error_flag = 0 ; + + Kokkos::parallel_reduce( 1 , driver , error_flag ); + + ASSERT_EQ( error_flag , 0 ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type , value_type & update ) const + { + long offset ; + + offset = -1 ; + for ( unsigned i6 = 0 ; i6 < unsigned(lsh.N6) ; ++i6 ) + for ( unsigned i5 = 0 ; i5 < unsigned(lsh.N5) ; ++i5 ) + for ( unsigned i4 = 0 ; i4 < unsigned(lsh.N4) ; ++i4 ) + for ( unsigned i3 = 0 ; i3 < unsigned(lsh.N3) ; ++i3 ) + for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 ) + for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 ) + for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 ) + { + const long j = & left( i0, i1, i2, i3, i4, i5, i6 ) - + & left( 0, 0, 0, 0, 0, 0, 0 ); + if ( j <= offset || left_alloc <= j ) { update |= 1 ; } + offset = j ; + } + + offset = -1 ; + for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 ) + for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 ) + for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 ) + for ( unsigned i3 = 0 ; i3 < unsigned(rsh.N3) ; ++i3 ) + for ( unsigned i4 = 0 ; i4 < unsigned(rsh.N4) ; ++i4 ) + for ( unsigned i5 = 0 ; i5 < unsigned(rsh.N5) 
; ++i5 ) + for ( unsigned i6 = 0 ; i6 < unsigned(rsh.N6) ; ++i6 ) + { + const long j = & right( i0, i1, i2, i3, i4, i5, i6 ) - + & right( 0, 0, 0, 0, 0, 0, 0 ); + if ( j <= offset || right_alloc <= j ) { update |= 2 ; } + offset = j ; + } + } +}; + +template< class DataType , class DeviceType > +struct TestViewOperator_LeftAndRight< DataType , DeviceType , 6 > +{ + typedef DeviceType execution_space ; + typedef typename execution_space::memory_space memory_space ; + typedef typename execution_space::size_type size_type ; + + typedef int value_type ; + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + const volatile value_type & input ) + { update |= input ; } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & update ) + { update = 0 ; } + + + typedef Kokkos:: + View< DataType, Kokkos::LayoutLeft, execution_space > left_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutRight, execution_space > right_view ; + + typedef typename left_view ::shape_type left_shape ; + typedef typename right_view::shape_type right_shape ; + + left_shape lsh ; + right_shape rsh ; + left_view left ; + right_view right ; + long left_alloc ; + long right_alloc ; + + TestViewOperator_LeftAndRight() + : lsh() + , rsh() + , left( "left" ) + , right( "right" ) + , left_alloc( allocation_count( left ) ) + , right_alloc( allocation_count( right ) ) + {} + + static void testit() + { + TestViewOperator_LeftAndRight driver ; + + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc ); + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc ); + + int error_flag = 0 ; + + Kokkos::parallel_reduce( 1 , driver , error_flag ); + + ASSERT_EQ( error_flag , 0 ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type , value_type & update ) const + { + long offset ; + + offset = -1 ; + for ( unsigned i5 = 0 ; i5 < unsigned(lsh.N5) ; ++i5 ) + for ( unsigned i4 = 0 ; i4 < unsigned(lsh.N4) 
; ++i4 ) + for ( unsigned i3 = 0 ; i3 < unsigned(lsh.N3) ; ++i3 ) + for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 ) + for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 ) + for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 ) + { + const long j = & left( i0, i1, i2, i3, i4, i5 ) - + & left( 0, 0, 0, 0, 0, 0 ); + if ( j <= offset || left_alloc <= j ) { update |= 1 ; } + offset = j ; + } + + offset = -1 ; + for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 ) + for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 ) + for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 ) + for ( unsigned i3 = 0 ; i3 < unsigned(rsh.N3) ; ++i3 ) + for ( unsigned i4 = 0 ; i4 < unsigned(rsh.N4) ; ++i4 ) + for ( unsigned i5 = 0 ; i5 < unsigned(rsh.N5) ; ++i5 ) + { + const long j = & right( i0, i1, i2, i3, i4, i5 ) - + & right( 0, 0, 0, 0, 0, 0 ); + if ( j <= offset || right_alloc <= j ) { update |= 2 ; } + offset = j ; + } + } +}; + +template< class DataType , class DeviceType > +struct TestViewOperator_LeftAndRight< DataType , DeviceType , 5 > +{ + typedef DeviceType execution_space ; + typedef typename execution_space::memory_space memory_space ; + typedef typename execution_space::size_type size_type ; + + typedef int value_type ; + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + const volatile value_type & input ) + { update |= input ; } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & update ) + { update = 0 ; } + + + typedef Kokkos:: + View< DataType, Kokkos::LayoutLeft, execution_space > left_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutRight, execution_space > right_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutStride, execution_space > stride_view ; + + typedef typename left_view ::shape_type left_shape ; + typedef typename right_view::shape_type right_shape ; + + left_shape lsh ; + right_shape rsh ; + left_view left ; + right_view right ; + stride_view left_stride ; + stride_view right_stride ; + 
long left_alloc ; + long right_alloc ; + + TestViewOperator_LeftAndRight() + : lsh() + , rsh() + , left( "left" ) + , right( "right" ) + , left_stride( left ) + , right_stride( right ) + , left_alloc( allocation_count( left ) ) + , right_alloc( allocation_count( right ) ) + {} + + static void testit() + { + TestViewOperator_LeftAndRight driver ; + + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc ); + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc ); + + int error_flag = 0 ; + + Kokkos::parallel_reduce( 1 , driver , error_flag ); + + ASSERT_EQ( error_flag , 0 ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type , value_type & update ) const + { + long offset ; + + offset = -1 ; + for ( unsigned i4 = 0 ; i4 < unsigned(lsh.N4) ; ++i4 ) + for ( unsigned i3 = 0 ; i3 < unsigned(lsh.N3) ; ++i3 ) + for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 ) + for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 ) + for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 ) + { + const long j = & left( i0, i1, i2, i3, i4 ) - + & left( 0, 0, 0, 0, 0 ); + if ( j <= offset || left_alloc <= j ) { update |= 1 ; } + offset = j ; + + if ( & left( i0, i1, i2, i3, i4 ) != + & left_stride( i0, i1, i2, i3, i4 ) ) { update |= 4 ; } + } + + offset = -1 ; + for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 ) + for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 ) + for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 ) + for ( unsigned i3 = 0 ; i3 < unsigned(rsh.N3) ; ++i3 ) + for ( unsigned i4 = 0 ; i4 < unsigned(rsh.N4) ; ++i4 ) + { + const long j = & right( i0, i1, i2, i3, i4 ) - + & right( 0, 0, 0, 0, 0 ); + if ( j <= offset || right_alloc <= j ) { update |= 2 ; } + offset = j ; + + if ( & right( i0, i1, i2, i3, i4 ) != + & right_stride( i0, i1, i2, i3, i4 ) ) { update |= 8 ; } + } + } +}; + +template< class DataType , class DeviceType > +struct TestViewOperator_LeftAndRight< DataType , 
DeviceType , 4 > +{ + typedef DeviceType execution_space ; + typedef typename execution_space::memory_space memory_space ; + typedef typename execution_space::size_type size_type ; + + typedef int value_type ; + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + const volatile value_type & input ) + { update |= input ; } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & update ) + { update = 0 ; } + + + typedef Kokkos:: + View< DataType, Kokkos::LayoutLeft, execution_space > left_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutRight, execution_space > right_view ; + + typedef typename left_view ::shape_type left_shape ; + typedef typename right_view::shape_type right_shape ; + + left_shape lsh ; + right_shape rsh ; + left_view left ; + right_view right ; + long left_alloc ; + long right_alloc ; + + TestViewOperator_LeftAndRight() + : lsh() + , rsh() + , left( "left" ) + , right( "right" ) + , left_alloc( allocation_count( left ) ) + , right_alloc( allocation_count( right ) ) + {} + + static void testit() + { + TestViewOperator_LeftAndRight driver ; + + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc ); + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc ); + + int error_flag = 0 ; + + Kokkos::parallel_reduce( 1 , driver , error_flag ); + + ASSERT_EQ( error_flag , 0 ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type , value_type & update ) const + { + long offset ; + + offset = -1 ; + for ( unsigned i3 = 0 ; i3 < unsigned(lsh.N3) ; ++i3 ) + for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 ) + for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 ) + for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 ) + { + const long j = & left( i0, i1, i2, i3 ) - + & left( 0, 0, 0, 0 ); + if ( j <= offset || left_alloc <= j ) { update |= 1 ; } + offset = j ; + } + + offset = -1 ; + for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 
) + for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 ) + for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 ) + for ( unsigned i3 = 0 ; i3 < unsigned(rsh.N3) ; ++i3 ) + { + const long j = & right( i0, i1, i2, i3 ) - + & right( 0, 0, 0, 0 ); + if ( j <= offset || right_alloc <= j ) { update |= 2 ; } + offset = j ; + } + } +}; + +template< class DataType , class DeviceType > +struct TestViewOperator_LeftAndRight< DataType , DeviceType , 3 > +{ + typedef DeviceType execution_space ; + typedef typename execution_space::memory_space memory_space ; + typedef typename execution_space::size_type size_type ; + + typedef int value_type ; + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + const volatile value_type & input ) + { update |= input ; } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & update ) + { update = 0 ; } + + + typedef Kokkos:: + View< DataType, Kokkos::LayoutLeft, execution_space > left_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutRight, execution_space > right_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutStride, execution_space > stride_view ; + + typedef typename left_view ::shape_type left_shape ; + typedef typename right_view::shape_type right_shape ; + + left_shape lsh ; + right_shape rsh ; + left_view left ; + right_view right ; + stride_view left_stride ; + stride_view right_stride ; + long left_alloc ; + long right_alloc ; + + TestViewOperator_LeftAndRight() + : lsh() + , rsh() + , left( std::string("left") ) + , right( std::string("right") ) + , left_stride( left ) + , right_stride( right ) + , left_alloc( allocation_count( left ) ) + , right_alloc( allocation_count( right ) ) + {} + + static void testit() + { + TestViewOperator_LeftAndRight driver ; + + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc ); + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc ); + + int error_flag = 0 ; + + 
Kokkos::parallel_reduce( 1 , driver , error_flag ); + + ASSERT_EQ( error_flag , 0 ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type , value_type & update ) const + { + long offset ; + + offset = -1 ; + for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 ) + for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 ) + for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 ) + { + const long j = & left( i0, i1, i2 ) - + & left( 0, 0, 0 ); + if ( j <= offset || left_alloc <= j ) { update |= 1 ; } + offset = j ; + + if ( & left(i0,i1,i2) != & left_stride(i0,i1,i2) ) { update |= 4 ; } + } + + offset = -1 ; + for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 ) + for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 ) + for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 ) + { + const long j = & right( i0, i1, i2 ) - + & right( 0, 0, 0 ); + if ( j <= offset || right_alloc <= j ) { update |= 2 ; } + offset = j ; + + if ( & right(i0,i1,i2) != & right_stride(i0,i1,i2) ) { update |= 8 ; } + } + + for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 ) + for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 ) + for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 ) + { + if ( & left(i0,i1,i2) != & left.at(i0,i1,i2,0,0,0,0,0) ) { update |= 3 ; } + if ( & right(i0,i1,i2) != & right.at(i0,i1,i2,0,0,0,0,0) ) { update |= 3 ; } + } + } +}; + +template< class DataType , class DeviceType > +struct TestViewOperator_LeftAndRight< DataType , DeviceType , 2 > +{ + typedef DeviceType execution_space ; + typedef typename execution_space::memory_space memory_space ; + typedef typename execution_space::size_type size_type ; + + typedef int value_type ; + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + const volatile value_type & input ) + { update |= input ; } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & update ) + { update = 0 ; } + + + typedef Kokkos:: + View< DataType, Kokkos::LayoutLeft, execution_space > left_view ; + + typedef 
Kokkos:: + View< DataType, Kokkos::LayoutRight, execution_space > right_view ; + + typedef typename left_view ::shape_type left_shape ; + typedef typename right_view::shape_type right_shape ; + + left_shape lsh ; + right_shape rsh ; + left_view left ; + right_view right ; + long left_alloc ; + long right_alloc ; + + TestViewOperator_LeftAndRight() + : lsh() + , rsh() + , left( Kokkos::ViewAllocate("left") ) + , right( Kokkos::ViewAllocate("right") ) + , left_alloc( allocation_count( left ) ) + , right_alloc( allocation_count( right ) ) + {} + + static void testit() + { + TestViewOperator_LeftAndRight driver ; + + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc ); + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc ); + + int error_flag = 0 ; + + Kokkos::parallel_reduce( 1 , driver , error_flag ); + + ASSERT_EQ( error_flag , 0 ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type , value_type & update ) const + { + long offset ; + + offset = -1 ; + for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 ) + for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 ) + { + const long j = & left( i0, i1 ) - + & left( 0, 0 ); + if ( j <= offset || left_alloc <= j ) { update |= 1 ; } + offset = j ; + } + + offset = -1 ; + for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 ) + for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 ) + { + const long j = & right( i0, i1 ) - + & right( 0, 0 ); + if ( j <= offset || right_alloc <= j ) { update |= 2 ; } + offset = j ; + } + + for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 ) + for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 ) + { + if ( & left(i0,i1) != & left.at(i0,i1,0,0,0,0,0,0) ) { update |= 3 ; } + if ( & right(i0,i1) != & right.at(i0,i1,0,0,0,0,0,0) ) { update |= 3 ; } + } + } +}; + +template< class DataType , class DeviceType > +struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 > +{ + typedef DeviceType 
execution_space ; + typedef typename execution_space::memory_space memory_space ; + typedef typename execution_space::size_type size_type ; + + typedef int value_type ; + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + const volatile value_type & input ) + { update |= input ; } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & update ) + { update = 0 ; } + + + typedef Kokkos:: + View< DataType, Kokkos::LayoutLeft, execution_space > left_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutRight, execution_space > right_view ; + + typedef Kokkos:: + View< DataType, Kokkos::LayoutStride, execution_space > stride_view ; + + typedef typename left_view ::shape_type left_shape ; + typedef typename right_view::shape_type right_shape ; + + left_shape lsh ; + right_shape rsh ; + left_view left ; + right_view right ; + stride_view left_stride ; + stride_view right_stride ; + long left_alloc ; + long right_alloc ; + + TestViewOperator_LeftAndRight() + : lsh() + , rsh() + , left( Kokkos::ViewAllocate() ) + , right( Kokkos::ViewAllocate() ) + , left_stride( left ) + , right_stride( right ) + , left_alloc( allocation_count( left ) ) + , right_alloc( allocation_count( right ) ) + {} + + static void testit() + { + TestViewOperator_LeftAndRight driver ; + + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc ); + ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc ); + + int error_flag = 0 ; + + Kokkos::parallel_reduce( 1 , driver , error_flag ); + + ASSERT_EQ( error_flag , 0 ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type , value_type & update ) const + { + for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 ) + { + if ( & left(i0) != & left.at(i0,0,0,0,0,0,0,0) ) { update |= 3 ; } + if ( & right(i0) != & right.at(i0,0,0,0,0,0,0,0) ) { update |= 3 ; } + if ( & left(i0) != & left_stride(i0) ) { update |= 4 ; } + if ( & right(i0) != & 
right_stride(i0) ) { update |= 8 ; } + } + } +}; + +/*--------------------------------------------------------------------------*/ + +template< typename T, class DeviceType > +class TestViewAPI +{ +public: + typedef DeviceType device ; + + enum { N0 = 1000 , + N1 = 3 , + N2 = 5 , + N3 = 7 }; + + typedef Kokkos::View< T , device > dView0 ; + typedef Kokkos::View< T* , device > dView1 ; + typedef Kokkos::View< T*[N1] , device > dView2 ; + typedef Kokkos::View< T*[N1][N2] , device > dView3 ; + typedef Kokkos::View< T*[N1][N2][N3] , device > dView4 ; + typedef Kokkos::View< const T*[N1][N2][N3] , device > const_dView4 ; + + typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged ; + + typedef typename dView0::host_mirror_space host ; + + TestViewAPI() + { + run_test_mirror(); + run_test(); + run_test_scalar(); + run_test_const(); + run_test_subview(); + run_test_subview_strided(); + run_test_vector(); + + TestViewOperator< T , device >::testit(); + TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2][3] , device >::testit(); + TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2] , device >::testit(); + TestViewOperator_LeftAndRight< int[2][3][4][2][3][4] , device >::testit(); + TestViewOperator_LeftAndRight< int[2][3][4][2][3] , device >::testit(); + TestViewOperator_LeftAndRight< int[2][3][4][2] , device >::testit(); + TestViewOperator_LeftAndRight< int[2][3][4] , device >::testit(); + TestViewOperator_LeftAndRight< int[2][3] , device >::testit(); + TestViewOperator_LeftAndRight< int[2] , device >::testit(); + } + + static void run_test_mirror() + { + typedef Kokkos::View< int , host > view_type ; + typedef typename view_type::HostMirror mirror_type ; + view_type a("a"); + mirror_type am = Kokkos::create_mirror_view(a); + mirror_type ax = Kokkos::create_mirror(a); + ASSERT_EQ( & a() , & am() ); + } + + static void run_test_scalar() + { + typedef typename dView0::HostMirror hView0 ; + + dView0 dx , dy ; + hView0 hx , hy ; + + dx = dView0( 
"dx" ); + dy = dView0( "dy" ); + + hx = Kokkos::create_mirror( dx ); + hy = Kokkos::create_mirror( dy ); + + hx = 1 ; + + Kokkos::deep_copy( dx , hx ); + Kokkos::deep_copy( dy , dx ); + Kokkos::deep_copy( hy , dy ); + + ASSERT_EQ( hx(), hy() ); + } + + static void run_test() + { + // mfh 14 Feb 2014: This test doesn't actually create instances of + // these types. In order to avoid "declared but unused typedef" + // warnings, we declare empty instances of these types, with the + // usual "(void)" marker to avoid compiler warnings for unused + // variables. + + typedef typename dView0::HostMirror hView0 ; + typedef typename dView1::HostMirror hView1 ; + typedef typename dView2::HostMirror hView2 ; + typedef typename dView3::HostMirror hView3 ; + typedef typename dView4::HostMirror hView4 ; + + { + hView0 thing; + (void) thing; + } + { + hView1 thing; + (void) thing; + } + { + hView2 thing; + (void) thing; + } + { + hView3 thing; + (void) thing; + } + { + hView4 thing; + (void) thing; + } + + dView4 dx , dy , dz ; + hView4 hx , hy , hz ; + + ASSERT_TRUE( dx.is_null() ); + ASSERT_TRUE( dy.is_null() ); + ASSERT_TRUE( dz.is_null() ); + ASSERT_TRUE( hx.is_null() ); + ASSERT_TRUE( hy.is_null() ); + ASSERT_TRUE( hz.is_null() ); + ASSERT_EQ( dx.dimension_0() , 0u ); + ASSERT_EQ( dy.dimension_0() , 0u ); + ASSERT_EQ( dz.dimension_0() , 0u ); + ASSERT_EQ( hx.dimension_0() , 0u ); + ASSERT_EQ( hy.dimension_0() , 0u ); + ASSERT_EQ( hz.dimension_0() , 0u ); + ASSERT_EQ( dx.dimension_1() , unsigned(N1) ); + ASSERT_EQ( dy.dimension_1() , unsigned(N1) ); + ASSERT_EQ( dz.dimension_1() , unsigned(N1) ); + ASSERT_EQ( hx.dimension_1() , unsigned(N1) ); + ASSERT_EQ( hy.dimension_1() , unsigned(N1) ); + ASSERT_EQ( hz.dimension_1() , unsigned(N1) ); + + dx = dView4( "dx" , N0 ); + dy = dView4( "dy" , N0 ); + + + + dView4_unmanaged unmanaged_dx = dx; + dView4_unmanaged unmanaged_from_ptr_dx = dView4_unmanaged(dx.ptr_on_device(), + dx.dimension_0(), + dx.dimension_1(), + dx.dimension_2(), + 
dx.dimension_3()); + + { + // Destruction of this view should be harmless + const_dView4 unmanaged_from_ptr_const_dx( dx.ptr_on_device() , + dx.dimension_0() , + dx.dimension_1() , + dx.dimension_2() , + dx.dimension_3() ); + } + + const_dView4 const_dx = dx ; + + + ASSERT_FALSE( dx.is_null() ); + ASSERT_FALSE( const_dx.is_null() ); + ASSERT_FALSE( unmanaged_dx.is_null() ); + ASSERT_FALSE( unmanaged_from_ptr_dx.is_null() ); + ASSERT_FALSE( dy.is_null() ); + ASSERT_NE( dx , dy ); + + ASSERT_EQ( dx.dimension_0() , unsigned(N0) ); + ASSERT_EQ( dx.dimension_1() , unsigned(N1) ); + ASSERT_EQ( dx.dimension_2() , unsigned(N2) ); + ASSERT_EQ( dx.dimension_3() , unsigned(N3) ); + + ASSERT_EQ( dy.dimension_0() , unsigned(N0) ); + ASSERT_EQ( dy.dimension_1() , unsigned(N1) ); + ASSERT_EQ( dy.dimension_2() , unsigned(N2) ); + ASSERT_EQ( dy.dimension_3() , unsigned(N3) ); + + ASSERT_EQ( unmanaged_from_ptr_dx.capacity(),unsigned(N0)*unsigned(N1)*unsigned(N2)*unsigned(N3) ); + + hx = Kokkos::create_mirror( dx ); + hy = Kokkos::create_mirror( dy ); + + // T v1 = hx() ; // Generates compile error as intended + // T v2 = hx(0,0) ; // Generates compile error as intended + // hx(0,0) = v2 ; // Generates compile error as intended + + size_t count = 0 ; + for ( size_t ip = 0 ; ip < N0 ; ++ip ) { + for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) { + hx(ip,i1,i2,i3) = ++count ; + }}}} + + Kokkos::deep_copy( dx , hx ); + Kokkos::deep_copy( dy , dx ); + Kokkos::deep_copy( hy , dy ); + + for ( size_t ip = 0 ; ip < N0 ; ++ip ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); } + }}}} + + Kokkos::deep_copy( dx , T(0) ); + Kokkos::deep_copy( hx , dx ); + + for ( size_t ip = 0 ; ip < N0 ; ++ip ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( 
size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); } + }}}} + + dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz); + dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz); + + dx = dView4(); + ASSERT_TRUE( dx.is_null() ); + ASSERT_FALSE( dy.is_null() ); + ASSERT_FALSE( dz.is_null() ); + dy = dView4(); + ASSERT_TRUE( dx.is_null() ); + ASSERT_TRUE( dy.is_null() ); + ASSERT_FALSE( dz.is_null() ); + dz = dView4(); + ASSERT_TRUE( dx.is_null() ); + ASSERT_TRUE( dy.is_null() ); + ASSERT_TRUE( dz.is_null() ); + } + + typedef T DataType[2] ; + + static void + check_auto_conversion_to_const( + const Kokkos::View< const DataType , device > & arg_const , + const Kokkos::View< DataType , device > & arg ) + { + ASSERT_TRUE( arg_const == arg ); + } + + static void run_test_const() + { + typedef Kokkos::View< DataType , device > typeX ; + typedef Kokkos::View< const DataType , device > const_typeX ; + typedef Kokkos::View< const DataType , device , Kokkos::MemoryRandomAccess > const_typeR ; + typeX x( "X" ); + const_typeX xc = x ; + const_typeR xr = x ; + + ASSERT_TRUE( xc == x ); + ASSERT_TRUE( x == xc ); + ASSERT_TRUE( x.ptr_on_device() == xr.ptr_on_device() ); + + // typeX xf = xc ; // setting non-const from const must not compile + + check_auto_conversion_to_const( x , x ); + } + + static void run_test_subview() + { + typedef Kokkos::View< const T , device > sView ; + + dView0 d0( "d0" ); + dView1 d1( "d1" , N0 ); + dView2 d2( "d2" , N0 ); + dView3 d3( "d3" , N0 ); + dView4 d4( "d4" , N0 ); + + sView s0 = d0 ; + sView s1 = Kokkos::subview( d1 , 1 ); + sView s2 = Kokkos::subview( d2 , 1 , 1 ); + sView s3 = Kokkos::subview( d3 , 1 , 1 , 1 ); + sView s4 = Kokkos::subview( d4 , 1 , 1 , 1 , 1 ); + } + + static void run_test_subview_strided() + { + typedef Kokkos::View< int **** , Kokkos::LayoutLeft , host > view_left_4 ; + typedef Kokkos::View< int **** , Kokkos::LayoutRight , host > view_right_4 ; + typedef 
Kokkos::View< int ** , Kokkos::LayoutLeft , host > view_left_2 ; + typedef Kokkos::View< int ** , Kokkos::LayoutRight , host > view_right_2 ; + + typedef Kokkos::View< int * , Kokkos::LayoutStride , host > view_stride_1 ; + typedef Kokkos::View< int ** , Kokkos::LayoutStride , host > view_stride_2 ; + + view_left_2 xl2("xl2", 100 , 200 ); + view_right_2 xr2("xr2", 100 , 200 ); + view_stride_1 yl1 = Kokkos::subview( xl2 , 0 , Kokkos::ALL() ); + view_stride_1 yl2 = Kokkos::subview( xl2 , 1 , Kokkos::ALL() ); + view_stride_1 yr1 = Kokkos::subview( xr2 , 0 , Kokkos::ALL() ); + view_stride_1 yr2 = Kokkos::subview( xr2 , 1 , Kokkos::ALL() ); + + ASSERT_EQ( yl1.dimension_0() , xl2.dimension_1() ); + ASSERT_EQ( yl2.dimension_0() , xl2.dimension_1() ); + ASSERT_EQ( yr1.dimension_0() , xr2.dimension_1() ); + ASSERT_EQ( yr2.dimension_0() , xr2.dimension_1() ); + + ASSERT_EQ( & yl1(0) - & xl2(0,0) , 0 ); + ASSERT_EQ( & yl2(0) - & xl2(1,0) , 0 ); + ASSERT_EQ( & yr1(0) - & xr2(0,0) , 0 ); + ASSERT_EQ( & yr2(0) - & xr2(1,0) , 0 ); + + view_left_4 xl4( "xl4" , 10 , 20 , 30 , 40 ); + view_right_4 xr4( "xr4" , 10 , 20 , 30 , 40 ); + + view_stride_2 yl4 = Kokkos::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() ); + view_stride_2 yr4 = Kokkos::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() ); + + ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() ); + ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() ); + ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() ); + ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() ); + + ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 ); + ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 ); + } + + static void run_test_vector() + { + static const unsigned Length = 1000 , Count = 8 ; + + typedef Kokkos::View< T* , Kokkos::LayoutLeft , host > vector_type ; + typedef Kokkos::View< T** , Kokkos::LayoutLeft , host > multivector_type ; + + typedef Kokkos::View< T* , Kokkos::LayoutRight , host > vector_right_type ; + typedef Kokkos::View< T** , Kokkos::LayoutRight , 
host > multivector_right_type ; + + typedef Kokkos::View< const T* , Kokkos::LayoutRight, host > const_vector_right_type ; + typedef Kokkos::View< const T* , Kokkos::LayoutLeft , host > const_vector_type ; + typedef Kokkos::View< const T** , Kokkos::LayoutLeft , host > const_multivector_type ; + + multivector_type mv = multivector_type( "mv" , Length , Count ); + multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count ); + + vector_type v1 = Kokkos::subview( mv , Kokkos::ALL() , 0 ); + vector_type v2 = Kokkos::subview( mv , Kokkos::ALL() , 1 ); + vector_type v3 = Kokkos::subview( mv , Kokkos::ALL() , 2 ); + + vector_type rv1 = Kokkos::subview( mv_right , 0 , Kokkos::ALL() ); + vector_type rv2 = Kokkos::subview( mv_right , 1 , Kokkos::ALL() ); + vector_type rv3 = Kokkos::subview( mv_right , 2 , Kokkos::ALL() ); + + multivector_type mv1 = Kokkos::subview( mv , std::make_pair( 1 , 998 ) , + std::make_pair( 2 , 5 ) ); + + multivector_right_type mvr1 = + Kokkos::subview( mv_right , + std::make_pair( 1 , 998 ) , + std::make_pair( 2 , 5 ) ); + + const_vector_type cv1 = Kokkos::subview( mv , Kokkos::ALL(), 0 ); + const_vector_type cv2 = Kokkos::subview( mv , Kokkos::ALL(), 1 ); + const_vector_type cv3 = Kokkos::subview( mv , Kokkos::ALL(), 2 ); + + vector_right_type vr1 = Kokkos::subview( mv , Kokkos::ALL() , 0 ); + vector_right_type vr2 = Kokkos::subview( mv , Kokkos::ALL() , 1 ); + vector_right_type vr3 = Kokkos::subview( mv , Kokkos::ALL() , 2 ); + + const_vector_right_type cvr1 = Kokkos::subview( mv , Kokkos::ALL() , 0 ); + const_vector_right_type cvr2 = Kokkos::subview( mv , Kokkos::ALL() , 1 ); + const_vector_right_type cvr3 = Kokkos::subview( mv , Kokkos::ALL() , 2 ); + + ASSERT_TRUE( & v1[0] == & v1(0) ); + ASSERT_TRUE( & v1[0] == & mv(0,0) ); + ASSERT_TRUE( & v2[0] == & mv(0,1) ); + ASSERT_TRUE( & v3[0] == & mv(0,2) ); + + ASSERT_TRUE( & cv1[0] == & mv(0,0) ); + ASSERT_TRUE( & cv2[0] == & mv(0,1) ); + ASSERT_TRUE( & cv3[0] == & mv(0,2) ); + 
+ ASSERT_TRUE( & vr1[0] == & mv(0,0) ); + ASSERT_TRUE( & vr2[0] == & mv(0,1) ); + ASSERT_TRUE( & vr3[0] == & mv(0,2) ); + + ASSERT_TRUE( & cvr1[0] == & mv(0,0) ); + ASSERT_TRUE( & cvr2[0] == & mv(0,1) ); + ASSERT_TRUE( & cvr3[0] == & mv(0,2) ); + + ASSERT_TRUE( & mv1(0,0) == & mv( 1 , 2 ) ); + ASSERT_TRUE( & mv1(1,1) == & mv( 2 , 3 ) ); + ASSERT_TRUE( & mv1(3,2) == & mv( 4 , 4 ) ); + ASSERT_TRUE( & mvr1(0,0) == & mv_right( 1 , 2 ) ); + ASSERT_TRUE( & mvr1(1,1) == & mv_right( 2 , 3 ) ); + ASSERT_TRUE( & mvr1(3,2) == & mv_right( 4 , 4 ) ); + + const_vector_type c_cv1( v1 ); + typename vector_type::const_type c_cv2( v2 ); + typename const_vector_type::const_type c_ccv2( v2 ); + + const_multivector_type cmv( mv ); + typename multivector_type::const_type cmvX( cmv ); + typename const_multivector_type::const_type ccmvX( cmv ); + } +}; + +} // namespace Test + +#endif + +/*--------------------------------------------------------------------------*/ + diff --git a/lib/kokkos/core/unit_test/TestViewImpl.hpp b/lib/kokkos/core/unit_test/TestViewImpl.hpp new file mode 100755 index 0000000000000000000000000000000000000000..c51588777be7e7694a27b1ba24ce1f0fc45c0dc1 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewImpl.hpp @@ -0,0 +1,289 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +/*--------------------------------------------------------------------------*/ + +#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + +namespace Test { + +template < class Device > +void test_view_impl() {} + +} + +#else + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +struct DummyMemorySpace +{ + typedef DummyMemorySpace memory_space ; + typedef unsigned size_type ; +}; + +/*--------------------------------------------------------------------------*/ + +template< class Type > +struct DefineShape { + typedef typename Kokkos::Impl::AnalyzeShape<Type>::shape type ; +}; + +template< class Type > +struct ExtractValueType { + typedef typename Kokkos::Impl::AnalyzeShape<Type>::value_type type ; +}; + +template< class Type > +struct ArrayType { typedef Type type ; }; + +template < class Device > +void test_view_impl() +{ + //typedef typename Device::memory_space memory_space ; // unused + + typedef ArrayType< int[100] >::type type_01 ; + typedef ArrayType< int* >::type type_11 ; + typedef ArrayType< int[5][6][700] >::type type_03 ; + typedef ArrayType< double*[8][9][900] >::type type_14 ; + typedef ArrayType< long** >::type type_22 ; + typedef ArrayType< short **[5][6][7] >::type type_25 ; + typedef ArrayType< const short **[5][6][7] >::type const_type_25 ; + typedef ArrayType< short***[5][6][7] >::type type_36 ; + typedef ArrayType< const short***[5][6][7] >::type const_type_36 ; + + // mfh 14 Feb 2014: With gcc 4.8.2 -Wall, this emits a warning: + // + // typedef ‘ok_const_25’ locally defined but not used [-Wunused-local-typedefs] + // + // It's unfortunate that this is the case, because the typedef is + // being used for a compile-time check! 
We deal with this by + // declaring an instance of ok_const_25, and marking it with + // "(void)" so that instance doesn't emit an "unused variable" + // warning. + // + // typedef typename Kokkos::Impl::StaticAssertSame< + // typename Kokkos::Impl::AnalyzeShape<type_25>::const_type , + // typename Kokkos::Impl::AnalyzeShape<const_type_25>::type + // > ok_const_25 ; + + typedef typename Kokkos::Impl::StaticAssertSame< + typename Kokkos::Impl::AnalyzeShape<type_25>::const_type, + typename Kokkos::Impl::AnalyzeShape<const_type_25>::type + > ok_const_25 ; + + typedef typename Kokkos::Impl::StaticAssertSame< + typename Kokkos::Impl::AnalyzeShape<type_36>::const_type, + typename Kokkos::Impl::AnalyzeShape<const_type_36>::type + > ok_const_36 ; + { + ok_const_25 thing_25 ; + ok_const_36 thing_36 ; + (void) thing_25 ; // silence warning for unused variable + (void) thing_36 ; // silence warning for unused variable + } + + ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_03>::type , int >::value ) ); + ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_14>::type , double >::value ) ); + ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_22>::type , long >::value ) ); + ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_36>::type , short >::value ) ); + + ASSERT_FALSE( ( Kokkos::Impl::is_same< ExtractValueType<type_36>::type , int >::value ) ); + + typedef typename DefineShape< type_01 >::type shape_01_type ; + typedef typename DefineShape< type_11 >::type shape_11_type ; + typedef typename DefineShape< type_03 >::type shape_03_type ; + typedef typename DefineShape< type_14 >::type shape_14_type ; + typedef typename DefineShape< type_22 >::type shape_22_type ; + typedef typename DefineShape< type_36 >::type shape_36_type ; + + ASSERT_TRUE( ( Kokkos::Impl::StaticAssert< shape_36_type::rank == 6 >::value ) ); + ASSERT_TRUE( ( Kokkos::Impl::StaticAssert< shape_03_type::rank == 3 >::value ) ); + + shape_01_type shape_01 ; 
shape_01_type::assign( shape_01 ); + shape_11_type shape_11 ; shape_11_type::assign( shape_11, 1000 ); + shape_03_type shape_03 ; shape_03_type::assign( shape_03 ); + shape_14_type shape_14 ; shape_14_type::assign( shape_14 , 0 ); + shape_22_type shape_22 ; shape_22_type::assign( shape_22 , 0 , 0 ); + shape_36_type shape_36 ; shape_36_type::assign( shape_36 , 10 , 20 , 30 ); + + ASSERT_TRUE( shape_01.rank_dynamic == 0u ); + ASSERT_TRUE( shape_01.rank == 1u ); + ASSERT_TRUE( shape_01.N0 == 100u ); + + ASSERT_TRUE( shape_11.rank_dynamic == 1u ); + ASSERT_TRUE( shape_11.rank == 1u ); + ASSERT_TRUE( shape_11.N0 == 1000u ); + + ASSERT_TRUE( shape_03.rank_dynamic == 0u ); + ASSERT_TRUE( shape_03.rank == 3u ); + ASSERT_TRUE( shape_03.N0 == 5u ); + ASSERT_TRUE( shape_03.N1 == 6u ); + ASSERT_TRUE( shape_03.N2 == 700u ); + + ASSERT_TRUE( shape_14.rank_dynamic == 1u ); + ASSERT_TRUE( shape_14.rank == 4u ); + ASSERT_TRUE( shape_14.N0 == 0u ); + ASSERT_TRUE( shape_14.N1 == 8u ); + ASSERT_TRUE( shape_14.N2 == 9u ); + ASSERT_TRUE( shape_14.N3 == 900u ); + + ASSERT_TRUE( shape_22.rank_dynamic == 2u ); + ASSERT_TRUE( shape_22.rank == 2u ); + ASSERT_TRUE( shape_22.N0 == 0u ); + ASSERT_TRUE( shape_22.N1 == 0u ); + + ASSERT_TRUE( shape_36.rank_dynamic == 3u ); + ASSERT_TRUE( shape_36.rank == 6u ); + ASSERT_TRUE( shape_36.N0 == 10u ); + ASSERT_TRUE( shape_36.N1 == 20u ); + ASSERT_TRUE( shape_36.N2 == 30u ); + ASSERT_TRUE( shape_36.N3 == 5u ); + ASSERT_TRUE( shape_36.N4 == 6u ); + ASSERT_TRUE( shape_36.N5 == 7u ); + + + ASSERT_TRUE( shape_01 == shape_01 ); + ASSERT_TRUE( shape_11 == shape_11 ); + ASSERT_TRUE( shape_36 == shape_36 ); + ASSERT_TRUE( shape_01 != shape_36 ); + ASSERT_TRUE( shape_22 != shape_36 ); + + //------------------------------------------------------------------------ + + typedef Kokkos::Impl::ViewOffset< shape_01_type , Kokkos::LayoutLeft > shape_01_left_offset ; + typedef Kokkos::Impl::ViewOffset< shape_11_type , Kokkos::LayoutLeft > shape_11_left_offset ; + typedef 
Kokkos::Impl::ViewOffset< shape_03_type , Kokkos::LayoutLeft > shape_03_left_offset ; + typedef Kokkos::Impl::ViewOffset< shape_14_type , Kokkos::LayoutLeft > shape_14_left_offset ; + typedef Kokkos::Impl::ViewOffset< shape_22_type , Kokkos::LayoutLeft > shape_22_left_offset ; + typedef Kokkos::Impl::ViewOffset< shape_36_type , Kokkos::LayoutLeft > shape_36_left_offset ; + + typedef Kokkos::Impl::ViewOffset< shape_01_type , Kokkos::LayoutRight > shape_01_right_offset ; + typedef Kokkos::Impl::ViewOffset< shape_11_type , Kokkos::LayoutRight > shape_11_right_offset ; + typedef Kokkos::Impl::ViewOffset< shape_03_type , Kokkos::LayoutRight > shape_03_right_offset ; + typedef Kokkos::Impl::ViewOffset< shape_14_type , Kokkos::LayoutRight > shape_14_right_offset ; + typedef Kokkos::Impl::ViewOffset< shape_22_type , Kokkos::LayoutRight > shape_22_right_offset ; + typedef Kokkos::Impl::ViewOffset< shape_36_type , Kokkos::LayoutRight > shape_36_right_offset ; + + ASSERT_TRUE( ! shape_01_left_offset::has_padding ); + ASSERT_TRUE( ! shape_11_left_offset::has_padding ); + ASSERT_TRUE( ! shape_03_left_offset::has_padding ); + ASSERT_TRUE( shape_14_left_offset::has_padding ); + ASSERT_TRUE( shape_22_left_offset::has_padding ); + ASSERT_TRUE( shape_36_left_offset::has_padding ); + + ASSERT_TRUE( ! shape_01_right_offset::has_padding ); + ASSERT_TRUE( ! shape_11_right_offset::has_padding ); + ASSERT_TRUE( ! shape_03_right_offset::has_padding ); + ASSERT_TRUE( ! 
shape_14_right_offset::has_padding ); + ASSERT_TRUE( shape_22_right_offset::has_padding ); + ASSERT_TRUE( shape_36_right_offset::has_padding ); + + //------------------------------------------------------------------------ + + typedef Kokkos::Impl::ViewOffset< shape_01_type , Kokkos::LayoutStride > shape_01_stride_offset ; + typedef Kokkos::Impl::ViewOffset< shape_36_type , Kokkos::LayoutStride > shape_36_stride_offset ; + + { + shape_01_stride_offset stride_offset_01 ; + + stride_offset_01.assign( 1, stride_offset_01.N0, 0,0,0,0,0,0,0 ); + + ASSERT_EQ( int(stride_offset_01.S[0]) , int(1) ); + ASSERT_EQ( int(stride_offset_01.S[1]) , int(stride_offset_01.N0) ); + } + + { + shape_36_stride_offset stride_offset_36 ; + + size_t str[7] ; + str[5] = 1 ; + str[4] = str[5] * stride_offset_36.N5 ; + str[3] = str[4] * stride_offset_36.N4 ; + str[2] = str[3] * stride_offset_36.N3 ; + str[1] = str[2] * 100 ; + str[0] = str[1] * 200 ; + str[6] = str[0] * 300 ; + + stride_offset_36.assign( str[0] , str[1] , str[2] , str[3] , str[4] , str[5] , str[6] , 0 , 0 ); + + ASSERT_EQ( size_t(stride_offset_36.S[6]) , size_t(str[6]) ); + ASSERT_EQ( size_t(stride_offset_36.N2) , size_t(100) ); + ASSERT_EQ( size_t(stride_offset_36.N1) , size_t(200) ); + ASSERT_EQ( size_t(stride_offset_36.N0) , size_t(300) ); + } + + //------------------------------------------------------------------------ + + { + const int rank = 6 ; + const int order[] = { 5 , 3 , 1 , 0 , 2 , 4 }; + const unsigned dim[] = { 2 , 3 , 5 , 7 , 11 , 13 }; + Kokkos::LayoutStride stride_6 = Kokkos::LayoutStride::order_dimensions( rank , order , dim ); + size_t n = 1 ; + for ( int i = 0 ; i < rank ; ++i ) { + ASSERT_EQ( size_t(dim[i]) , size_t( stride_6.dimension[i] ) ); + ASSERT_EQ( size_t(n) , size_t( stride_6.stride[ order[i] ] ) ); + n *= dim[order[i]] ; + } + } + + //------------------------------------------------------------------------ +} + +} /* namespace Test */ + +#endif + 
+/*--------------------------------------------------------------------------*/ + diff --git a/lib/kokkos/core/unit_test/TestViewMapping.hpp b/lib/kokkos/core/unit_test/TestViewMapping.hpp new file mode 100755 index 0000000000000000000000000000000000000000..31e0c6a7b04690382d1c608664680f089e54fb5a --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewMapping.hpp @@ -0,0 +1,1018 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template< class RangeType > +void test_view_range( const size_t N , const RangeType & range , const size_t begin , const size_t dim ) +{ + typedef Kokkos::Experimental::Impl::ViewOffsetRange< RangeType > query ; + + ASSERT_EQ( query::begin( range ) , begin ); + ASSERT_EQ( query::dimension( N , range ) , dim ); + ASSERT_EQ( query::is_range , dim != 0 ); +} + + +template< class ExecSpace > +void test_view_mapping() +{ + typedef Kokkos::Experimental::Impl::ViewDimension<> dim_0 ; + typedef Kokkos::Experimental::Impl::ViewDimension<2> dim_s2 ; + typedef Kokkos::Experimental::Impl::ViewDimension<2,3> dim_s2_s3 ; + typedef Kokkos::Experimental::Impl::ViewDimension<2,3,4> dim_s2_s3_s4 ; + + typedef Kokkos::Experimental::Impl::ViewDimension<0> dim_s0 ; + typedef Kokkos::Experimental::Impl::ViewDimension<0,3> dim_s0_s3 ; + typedef Kokkos::Experimental::Impl::ViewDimension<0,3,4> dim_s0_s3_s4 ; + + typedef Kokkos::Experimental::Impl::ViewDimension<0,0> dim_s0_s0 ; + typedef Kokkos::Experimental::Impl::ViewDimension<0,0,4> dim_s0_s0_s4 ; + + typedef 
Kokkos::Experimental::Impl::ViewDimension<0,0,0> dim_s0_s0_s0 ; + typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0> dim_s0_s0_s0_s0 ; + typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0> dim_s0_s0_s0_s0_s0 ; + typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0 ; + typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0_s0 ; + typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0_s0_s0 ; + + // Fully static dimensions should not be larger than an int + ASSERT_LE( sizeof(dim_0) , sizeof(int) ); + ASSERT_LE( sizeof(dim_s2) , sizeof(int) ); + ASSERT_LE( sizeof(dim_s2_s3) , sizeof(int) ); + ASSERT_LE( sizeof(dim_s2_s3_s4) , sizeof(int) ); + + // Rank 1 is size_t + ASSERT_EQ( sizeof(dim_s0) , sizeof(size_t) ); + ASSERT_EQ( sizeof(dim_s0_s3) , sizeof(size_t) ); + ASSERT_EQ( sizeof(dim_s0_s3_s4) , sizeof(size_t) ); + + // Allow for padding + ASSERT_LE( sizeof(dim_s0_s0) , 2 * sizeof(size_t) ); + ASSERT_LE( sizeof(dim_s0_s0_s4) , 2 * sizeof(size_t) ); + + ASSERT_LE( sizeof(dim_s0_s0_s0) , 4 * sizeof(size_t) ); + ASSERT_EQ( sizeof(dim_s0_s0_s0_s0) , 4 * sizeof(unsigned) ); + ASSERT_LE( sizeof(dim_s0_s0_s0_s0_s0) , 6 * sizeof(unsigned) ); + ASSERT_EQ( sizeof(dim_s0_s0_s0_s0_s0_s0) , 6 * sizeof(unsigned) ); + ASSERT_LE( sizeof(dim_s0_s0_s0_s0_s0_s0_s0) , 8 * sizeof(unsigned) ); + ASSERT_EQ( sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0) , 8 * sizeof(unsigned) ); + + ASSERT_EQ( int(dim_0::rank) , int(0) ); + ASSERT_EQ( int(dim_0::rank_dynamic) , int(0) ); + + ASSERT_EQ( int(dim_s2::rank) , int(1) ); + ASSERT_EQ( int(dim_s2::rank_dynamic) , int(0) ); + + ASSERT_EQ( int(dim_s2_s3::rank) , int(2) ); + ASSERT_EQ( int(dim_s2_s3::rank_dynamic) , int(0) ); + + ASSERT_EQ( int(dim_s2_s3_s4::rank) , int(3) ); + ASSERT_EQ( int(dim_s2_s3_s4::rank_dynamic) , int(0) ); + + ASSERT_EQ( int(dim_s0::rank) , int(1) ); + ASSERT_EQ( int(dim_s0::rank_dynamic) , int(1) ); + + ASSERT_EQ( 
int(dim_s0_s3::rank) , int(2) ); + ASSERT_EQ( int(dim_s0_s3::rank_dynamic) , int(1) ); + + ASSERT_EQ( int(dim_s0_s3_s4::rank) , int(3) ); + ASSERT_EQ( int(dim_s0_s3_s4::rank_dynamic) , int(1) ); + + ASSERT_EQ( int(dim_s0_s0_s4::rank) , int(3) ); + ASSERT_EQ( int(dim_s0_s0_s4::rank_dynamic) , int(2) ); + + ASSERT_EQ( int(dim_s0_s0_s0::rank) , int(3) ); + ASSERT_EQ( int(dim_s0_s0_s0::rank_dynamic) , int(3) ); + + ASSERT_EQ( int(dim_s0_s0_s0_s0::rank) , int(4) ); + ASSERT_EQ( int(dim_s0_s0_s0_s0::rank_dynamic) , int(4) ); + + ASSERT_EQ( int(dim_s0_s0_s0_s0_s0::rank) , int(5) ); + ASSERT_EQ( int(dim_s0_s0_s0_s0_s0::rank_dynamic) , int(5) ); + + ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0::rank) , int(6) ); + ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) , int(6) ); + + ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0::rank) , int(7) ); + ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) , int(7) ); + + ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) , int(8) ); + ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) , int(8) ); + + dim_s0 d1( 2, 3, 4, 5, 6, 7, 8, 9 ); + dim_s0_s0 d2( 2, 3, 4, 5, 6, 7, 8, 9 ); + dim_s0_s0_s0 d3( 2, 3, 4, 5, 6, 7, 8, 9 ); + dim_s0_s0_s0_s0 d4( 2, 3, 4, 5, 6, 7, 8, 9 ); + + ASSERT_EQ( d1.N0 , 2 ); + ASSERT_EQ( d2.N0 , 2 ); + ASSERT_EQ( d3.N0 , 2 ); + ASSERT_EQ( d4.N0 , 2 ); + + ASSERT_EQ( d1.N1 , 1 ); + ASSERT_EQ( d2.N1 , 3 ); + ASSERT_EQ( d3.N1 , 3 ); + ASSERT_EQ( d4.N1 , 3 ); + + ASSERT_EQ( d1.N2 , 1 ); + ASSERT_EQ( d2.N2 , 1 ); + ASSERT_EQ( d3.N2 , 4 ); + ASSERT_EQ( d4.N2 , 4 ); + + ASSERT_EQ( d1.N3 , 1 ); + ASSERT_EQ( d2.N3 , 1 ); + ASSERT_EQ( d3.N3 , 1 ); + ASSERT_EQ( d4.N3 , 5 ); + + //---------------------------------------- + + typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s0 , Kokkos::LayoutStride > stride_s0_s0_s0 ; + + //---------------------------------------- + // Static dimension + { + typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4 , Kokkos::LayoutLeft > left_s2_s3_s4 ; + + ASSERT_EQ( 
sizeof(left_s2_s3_s4) , sizeof(dim_s2_s3_s4) ); + + left_s2_s3_s4 off3 ; + + stride_s0_s0_s0 stride3( off3 ); + + ASSERT_EQ( off3.stride_0() , 1 ); + ASSERT_EQ( off3.stride_1() , 2 ); + ASSERT_EQ( off3.stride_2() , 6 ); + ASSERT_EQ( off3.span() , 24 ); + + ASSERT_EQ( off3.stride_0() , stride3.stride_0() ); + ASSERT_EQ( off3.stride_1() , stride3.stride_1() ); + ASSERT_EQ( off3.stride_2() , stride3.stride_2() ); + ASSERT_EQ( off3.span() , stride3.span() ); + + int offset = 0 ; + + for ( int k = 0 ; k < 4 ; ++k ){ + for ( int j = 0 ; j < 3 ; ++j ){ + for ( int i = 0 ; i < 2 ; ++i , ++offset ){ + ASSERT_EQ( off3(i,j,k) , offset ); + ASSERT_EQ( stride3(i,j,k) , off3(i,j,k) ); + }}} + } + + //---------------------------------------- + // Small dimension is unpadded + { + typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ; + + left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), 2, 3, 0, 0, 0, 0, 0, 0 ); + + stride_s0_s0_s0 stride3( dyn_off3 ); + + ASSERT_EQ( dyn_off3.m_dim.rank , 3 ); + ASSERT_EQ( dyn_off3.m_dim.N0 , 2 ); + ASSERT_EQ( dyn_off3.m_dim.N1 , 3 ); + ASSERT_EQ( dyn_off3.m_dim.N2 , 4 ); + ASSERT_EQ( dyn_off3.m_dim.N3 , 1 ); + ASSERT_EQ( dyn_off3.size() , 2 * 3 * 4 ); + + ASSERT_EQ( stride3.m_dim.rank , 3 ); + ASSERT_EQ( stride3.m_dim.N0 , 2 ); + ASSERT_EQ( stride3.m_dim.N1 , 3 ); + ASSERT_EQ( stride3.m_dim.N2 , 4 ); + ASSERT_EQ( stride3.m_dim.N3 , 1 ); + ASSERT_EQ( stride3.size() , 2 * 3 * 4 ); + + int offset = 0 ; + + for ( int k = 0 ; k < 4 ; ++k ){ + for ( int j = 0 ; j < 3 ; ++j ){ + for ( int i = 0 ; i < 2 ; ++i , ++offset ){ + ASSERT_EQ( offset , dyn_off3(i,j,k) ); + ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) ); + }}} + + ASSERT_EQ( dyn_off3.span() , offset ); + ASSERT_EQ( stride3.span() , dyn_off3.span() ); + } + + // Large dimension is likely padded + { + constexpr int N0 = 2000 ; + constexpr int N1 = 300 ; + + typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , 
Kokkos::LayoutLeft > left_s0_s0_s4 ; + + left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), N0, N1, 0, 0, 0, 0, 0, 0 ); + + stride_s0_s0_s0 stride3( dyn_off3 ); + + ASSERT_EQ( dyn_off3.m_dim.rank , 3 ); + ASSERT_EQ( dyn_off3.m_dim.N0 , N0 ); + ASSERT_EQ( dyn_off3.m_dim.N1 , N1 ); + ASSERT_EQ( dyn_off3.m_dim.N2 , 4 ); + ASSERT_EQ( dyn_off3.m_dim.N3 , 1 ); + ASSERT_EQ( dyn_off3.size() , N0 * N1 * 4 ); + + ASSERT_EQ( stride3.m_dim.rank , 3 ); + ASSERT_EQ( stride3.m_dim.N0 , N0 ); + ASSERT_EQ( stride3.m_dim.N1 , N1 ); + ASSERT_EQ( stride3.m_dim.N2 , 4 ); + ASSERT_EQ( stride3.m_dim.N3 , 1 ); + ASSERT_EQ( stride3.size() , N0 * N1 * 4 ); + ASSERT_EQ( stride3.span() , dyn_off3.span() ); + + int offset = 0 ; + + for ( int k = 0 ; k < 4 ; ++k ){ + for ( int j = 0 ; j < N1 ; ++j ){ + for ( int i = 0 ; i < N0 ; ++i ){ + ASSERT_LE( offset , dyn_off3(i,j,k) ); + ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) ); + offset = dyn_off3(i,j,k) + 1 ; + }}} + + ASSERT_LE( offset , dyn_off3.span() ); + } + + //---------------------------------------- + // Static dimension + { + typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4 , Kokkos::LayoutRight > right_s2_s3_s4 ; + + ASSERT_EQ( sizeof(right_s2_s3_s4) , sizeof(dim_s2_s3_s4) ); + + right_s2_s3_s4 off3 ; + + stride_s0_s0_s0 stride3( off3 ); + + ASSERT_EQ( off3.stride_0() , 12 ); + ASSERT_EQ( off3.stride_1() , 4 ); + ASSERT_EQ( off3.stride_2() , 1 ); + + ASSERT_EQ( off3.dimension_0() , stride3.dimension_0() ); + ASSERT_EQ( off3.dimension_1() , stride3.dimension_1() ); + ASSERT_EQ( off3.dimension_2() , stride3.dimension_2() ); + ASSERT_EQ( off3.stride_0() , stride3.stride_0() ); + ASSERT_EQ( off3.stride_1() , stride3.stride_1() ); + ASSERT_EQ( off3.stride_2() , stride3.stride_2() ); + ASSERT_EQ( off3.span() , stride3.span() ); + + int offset = 0 ; + + for ( int i = 0 ; i < 2 ; ++i ){ + for ( int j = 0 ; j < 3 ; ++j ){ + for ( int k = 0 ; k < 4 ; ++k , ++offset ){ + ASSERT_EQ( off3(i,j,k) , offset ); + 
ASSERT_EQ( off3(i,j,k) , stride3(i,j,k) ); + }}} + + ASSERT_EQ( off3.span() , offset ); + } + + //---------------------------------------- + // Small dimension is unpadded + { + typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ; + + right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), 2, 3, 0, 0, 0, 0, 0, 0 ); + + stride_s0_s0_s0 stride3( dyn_off3 ); + + ASSERT_EQ( dyn_off3.m_dim.rank , 3 ); + ASSERT_EQ( dyn_off3.m_dim.N0 , 2 ); + ASSERT_EQ( dyn_off3.m_dim.N1 , 3 ); + ASSERT_EQ( dyn_off3.m_dim.N2 , 4 ); + ASSERT_EQ( dyn_off3.m_dim.N3 , 1 ); + ASSERT_EQ( dyn_off3.size() , 2 * 3 * 4 ); + + ASSERT_EQ( dyn_off3.dimension_0() , stride3.dimension_0() ); + ASSERT_EQ( dyn_off3.dimension_1() , stride3.dimension_1() ); + ASSERT_EQ( dyn_off3.dimension_2() , stride3.dimension_2() ); + ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() ); + ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() ); + ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() ); + ASSERT_EQ( dyn_off3.span() , stride3.span() ); + + int offset = 0 ; + + for ( int i = 0 ; i < 2 ; ++i ){ + for ( int j = 0 ; j < 3 ; ++j ){ + for ( int k = 0 ; k < 4 ; ++k , ++offset ){ + ASSERT_EQ( offset , dyn_off3(i,j,k) ); + ASSERT_EQ( dyn_off3(i,j,k) , stride3(i,j,k) ); + }}} + + ASSERT_EQ( dyn_off3.span() , offset ); + } + + // Large dimension is likely padded + { + constexpr int N0 = 2000 ; + constexpr int N1 = 300 ; + + typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ; + + right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), N0, N1, 0, 0, 0, 0, 0, 0 ); + + stride_s0_s0_s0 stride3( dyn_off3 ); + + ASSERT_EQ( dyn_off3.m_dim.rank , 3 ); + ASSERT_EQ( dyn_off3.m_dim.N0 , N0 ); + ASSERT_EQ( dyn_off3.m_dim.N1 , N1 ); + ASSERT_EQ( dyn_off3.m_dim.N2 , 4 ); + ASSERT_EQ( dyn_off3.m_dim.N3 , 1 ); + ASSERT_EQ( dyn_off3.size() , N0 * N1 * 4 ); + + ASSERT_EQ( dyn_off3.dimension_0() , 
stride3.dimension_0() ); + ASSERT_EQ( dyn_off3.dimension_1() , stride3.dimension_1() ); + ASSERT_EQ( dyn_off3.dimension_2() , stride3.dimension_2() ); + ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() ); + ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() ); + ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() ); + ASSERT_EQ( dyn_off3.span() , stride3.span() ); + + int offset = 0 ; + + for ( int i = 0 ; i < N0 ; ++i ){ + for ( int j = 0 ; j < N1 ; ++j ){ + for ( int k = 0 ; k < 4 ; ++k ){ + ASSERT_LE( offset , dyn_off3(i,j,k) ); + ASSERT_EQ( dyn_off3(i,j,k) , stride3(i,j,k) ); + offset = dyn_off3(i,j,k) + 1 ; + }}} + + ASSERT_LE( offset , dyn_off3.span() ); + } + + //---------------------------------------- + // Subview + { + constexpr int N0 = 2000 ; + constexpr int N1 = 300 ; + + constexpr int sub_N0 = 1000 ; + constexpr int sub_N1 = 200 ; + constexpr int sub_N2 = 4 ; + + typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ; + + left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), N0, N1, 0, 0, 0, 0, 0, 0 ); + + stride_s0_s0_s0 stride3( dyn_off3 , sub_N0 , sub_N1 , sub_N2 , 0 , 0 , 0 , 0 , 0 ); + + ASSERT_EQ( stride3.dimension_0() , sub_N0 ); + ASSERT_EQ( stride3.dimension_1() , sub_N1 ); + ASSERT_EQ( stride3.dimension_2() , sub_N2 ); + ASSERT_EQ( stride3.size() , sub_N0 * sub_N1 * sub_N2 ); + + ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() ); + ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() ); + ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() ); + ASSERT_GE( dyn_off3.span() , stride3.span() ); + + for ( int k = 0 ; k < sub_N2 ; ++k ){ + for ( int j = 0 ; j < sub_N1 ; ++j ){ + for ( int i = 0 ; i < sub_N0 ; ++i ){ + ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) ); + }}} + } + + { + constexpr int N0 = 2000 ; + constexpr int N1 = 300 ; + + constexpr int sub_N0 = 1000 ; + constexpr int sub_N1 = 200 ; + constexpr int sub_N2 = 4 ; + + typedef Kokkos::Experimental::Impl::ViewOffset< 
dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ; + + right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), N0, N1, 0, 0, 0, 0, 0, 0 ); + + stride_s0_s0_s0 stride3( dyn_off3 , sub_N0 , sub_N1 , sub_N2 , 0 , 0 , 0 , 0 , 0 ); + + ASSERT_EQ( stride3.dimension_0() , sub_N0 ); + ASSERT_EQ( stride3.dimension_1() , sub_N1 ); + ASSERT_EQ( stride3.dimension_2() , sub_N2 ); + ASSERT_EQ( stride3.size() , sub_N0 * sub_N1 * sub_N2 ); + + ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() ); + ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() ); + ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() ); + ASSERT_GE( dyn_off3.span() , stride3.span() ); + + for ( int i = 0 ; i < sub_N0 ; ++i ){ + for ( int j = 0 ; j < sub_N1 ; ++j ){ + for ( int k = 0 ; k < sub_N2 ; ++k ){ + ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) ); + }}} + } + + //---------------------------------------- + { + constexpr int N = 1000 ; + + test_view_range( N , N / 2 , N / 2 , 0 ); + test_view_range( N , Kokkos::Experimental::ALL , 0 , N ); + test_view_range( N , std::pair<int,int>( N / 4 , 10 + N / 4 ) , N / 4 , 10 ); + test_view_range( N , Kokkos::pair<int,int>( N / 4 , 10 + N / 4 ) , N / 4 , 10 ); + } + //---------------------------------------- + // view data analysis + + { + typedef Kokkos::Experimental::Impl::ViewDataAnalysis< const int[] > a_const_int_r1 ; + + ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::specialize , void >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::dimension , Kokkos::Experimental::Impl::ViewDimension<0> >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::type , const int[] >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::value_type , const int >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::array_scalar_type , const int[] >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::const_type , const int[] >::value )); + ASSERT_TRUE( ( std::is_same< typename 
a_const_int_r1::const_value_type , const int >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::const_array_scalar_type , const int[] >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::non_const_type , int [] >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::non_const_value_type , int >::value )); + + typedef Kokkos::Experimental::Impl::ViewDataAnalysis< const int**[4] > a_const_int_r3 ; + + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::specialize , void >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::dimension , Kokkos::Experimental::Impl::ViewDimension<0,0,4> >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::type , const int**[4] >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::value_type , const int >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::array_scalar_type , const int**[4] >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::const_type , const int**[4] >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::const_value_type , const int >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::const_array_scalar_type , const int**[4] >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::non_const_type , int**[4] >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::non_const_value_type , int >::value )); + ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::non_const_array_scalar_type , int**[4] >::value )); + } + + //---------------------------------------- + + { + constexpr int N = 10 ; + + typedef Kokkos::Experimental::View<int*,ExecSpace> T ; + typedef Kokkos::Experimental::View<const int*,ExecSpace> C ; + + int data[N] ; + + T vr1(data,N); + C cr1(vr1); + + // Generate static_assert error: + // T tmp( cr1 ); + + ASSERT_EQ( vr1.span() , N ); + ASSERT_EQ( cr1.span() , N ); + ASSERT_EQ( vr1.data() , & data[0] ); + ASSERT_EQ( cr1.data() , & 
data[0] ); + + ASSERT_TRUE( ( std::is_same< typename T::data_type , int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::const_data_type , const int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type , int* >::value ) ); + + ASSERT_TRUE( ( std::is_same< typename T::array_scalar_type , int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::const_array_scalar_type , const int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::non_const_array_scalar_type , int* >::value ) ); + + ASSERT_TRUE( ( std::is_same< typename T::value_type , int >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::const_value_type , const int >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type , int >::value ) ); + + ASSERT_TRUE( ( std::is_same< typename T::memory_space , typename ExecSpace::memory_space >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::reference_type , int & >::value ) ); + + ASSERT_EQ( T::Rank , 1 ); + + ASSERT_TRUE( ( std::is_same< typename C::data_type , const int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename C::const_data_type , const int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename C::non_const_data_type , int* >::value ) ); + + ASSERT_TRUE( ( std::is_same< typename C::array_scalar_type , const int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename C::const_array_scalar_type , const int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename C::non_const_array_scalar_type , int* >::value ) ); + + ASSERT_TRUE( ( std::is_same< typename C::value_type , const int >::value ) ); + ASSERT_TRUE( ( std::is_same< typename C::const_value_type , const int >::value ) ); + ASSERT_TRUE( ( std::is_same< typename C::non_const_value_type , int >::value ) ); + + ASSERT_TRUE( ( std::is_same< typename C::memory_space , typename ExecSpace::memory_space >::value ) ); + ASSERT_TRUE( ( std::is_same< typename C::reference_type , const int & >::value ) ); + + ASSERT_EQ( C::Rank , 1 ); + + 
ASSERT_EQ( vr1.dimension_0() , N ); + + if ( Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename ExecSpace::memory_space , Kokkos::HostSpace >::value ) { + for ( int i = 0 ; i < N ; ++i ) data[i] = i + 1 ; + for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 1 ); + for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( cr1[i] , i + 1 ); + + { + T tmp( vr1 ); + for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 1 ); + for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 2 ; + for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 2 ); + } + + for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 2 ); + } + } + + { + constexpr int N = 10 ; + typedef Kokkos::Experimental::View<int*,ExecSpace> T ; + typedef Kokkos::Experimental::View<const int*,ExecSpace> C ; + + T vr1("vr1",N); + C cr1(vr1); + + ASSERT_TRUE( ( std::is_same< typename T::data_type , int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::const_data_type , const int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type , int* >::value ) ); + + ASSERT_TRUE( ( std::is_same< typename T::array_scalar_type , int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::const_array_scalar_type , const int* >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::non_const_array_scalar_type , int* >::value ) ); + + ASSERT_TRUE( ( std::is_same< typename T::value_type , int >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::const_value_type , const int >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type , int >::value ) ); + + ASSERT_TRUE( ( std::is_same< typename T::memory_space , typename ExecSpace::memory_space >::value ) ); + ASSERT_TRUE( ( std::is_same< typename T::reference_type , int & >::value ) ); + ASSERT_EQ( T::Rank , 1 ); + + ASSERT_EQ( vr1.dimension_0() , N ); + + if ( Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename ExecSpace::memory_space , Kokkos::HostSpace >::value ) { + for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 1 ; + 
for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 1 ); + for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( cr1[i] , i + 1 ); + + { + T tmp( vr1 ); + for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 1 ); + for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 2 ; + for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 2 ); + } + + for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 2 ); + } + } + + { + using namespace Kokkos::Experimental ; + + typedef typename ExecSpace::memory_space memory_space ; + typedef View<int*,memory_space> V ; + + constexpr int N = 10 ; + + memory_space mem_space ; + + V v( "v" , N ); + V va( view_alloc() , N ); + V vb( view_alloc( "vb" ) , N ); + V vc( view_alloc( "vc" , AllowPadding ) , N ); + V vd( view_alloc( "vd" , WithoutInitializing ) , N ); + V ve( view_alloc( "ve" , WithoutInitializing , AllowPadding ) , N ); + V vf( view_alloc( "vf" , mem_space , WithoutInitializing , AllowPadding ) , N ); + V vg( view_alloc( mem_space , "vg" , WithoutInitializing , AllowPadding ) , N ); + V vh( view_alloc( WithoutInitializing , AllowPadding ) , N ); + V vi( view_alloc( WithoutInitializing ) , N ); + V vj( view_alloc( std::string("vj") , AllowPadding ) , N ); + V vk( view_alloc( mem_space , std::string("vk") , AllowPadding ) , N ); + } + + { + typedef Kokkos::Experimental::ViewTraits<int***,Kokkos::LayoutStride,ExecSpace> traits_t ; + typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0> dims_t ; + typedef Kokkos::Experimental::Impl::ViewOffset< dims_t , Kokkos::LayoutStride > offset_t ; + + Kokkos::LayoutStride stride ; + + stride.dimension[0] = 3 ; + stride.dimension[1] = 4 ; + stride.dimension[2] = 5 ; + stride.stride[0] = 4 ; + stride.stride[1] = 1 ; + stride.stride[2] = 12 ; + + const offset_t offset( stride ); + + ASSERT_EQ( offset.dimension_0() , 3 ); + ASSERT_EQ( offset.dimension_1() , 4 ); + ASSERT_EQ( offset.dimension_2() , 5 ); + + ASSERT_EQ( offset.stride_0() , 4 ); + ASSERT_EQ( offset.stride_1() , 1 ); + ASSERT_EQ( 
offset.stride_2() , 12 ); + + ASSERT_EQ( offset.span() , 60 ); + ASSERT_TRUE( offset.span_is_contiguous() ); + + Kokkos::Experimental::Impl::ViewMapping< traits_t , void > v( (int*) 0 , std::false_type() , stride ); + } + + { + typedef Kokkos::Experimental::View<int**,ExecSpace> V ; + typedef typename V::HostMirror M ; + + constexpr int N0 = 10 ; + constexpr int N1 = 11 ; + + V a("a",N0,N1); + M b = Kokkos::Experimental::create_mirror(a); + M c = Kokkos::Experimental::create_mirror_view(a); + + for ( int i0 = 0 ; i0 < N0 ; ++i0 ) + for ( int i1 = 0 ; i1 < N1 ; ++i1 ) + b(i0,i1) = 1 + i0 + i1 * N0 ; + + Kokkos::Experimental::deep_copy( a , b ); + Kokkos::Experimental::deep_copy( c , a ); + + for ( int i0 = 0 ; i0 < N0 ; ++i0 ) + for ( int i1 = 0 ; i1 < N1 ; ++i1 ) + ASSERT_EQ( b(i0,i1) , c(i0,i1) ); + + Kokkos::Experimental::resize( b , 5 , 6 ); + Kokkos::Experimental::realloc( c , 5 , 6 ); + + ASSERT_EQ( b.dimension_0() , 5 ); + ASSERT_EQ( b.dimension_1() , 6 ); + ASSERT_EQ( c.dimension_0() , 5 ); + ASSERT_EQ( c.dimension_1() , 6 ); + } +} + +template< class ExecSpace > +struct TestViewMappingSubview { + + constexpr static int AN = 10 ; + typedef Kokkos::Experimental::View<int*,ExecSpace> AT ; + typedef Kokkos::Experimental::Subview< AT , true > AS ; + + constexpr static int BN0 = 10 , BN1 = 11 , BN2 = 12 ; + typedef Kokkos::Experimental::View<int***,ExecSpace> BT ; + typedef Kokkos::Experimental::Subview< BT , true , true , true > BS ; + + constexpr static int CN0 = 10 , CN1 = 11 , CN2 = 12 ; + typedef Kokkos::Experimental::View<int***[13][14],ExecSpace> CT ; + typedef Kokkos::Experimental::Subview< CT , true , true , true , false , false > CS ; + + constexpr static int DN0 = 10 , DN1 = 11 , DN2 = 12 ; + typedef Kokkos::Experimental::View<int***[13][14],ExecSpace> DT ; + typedef Kokkos::Experimental::Subview< DT , false , true , true , true , false > DS ; + + + typedef Kokkos::Experimental::View<int***[13][14],Kokkos::LayoutLeft,ExecSpace> DLT ; + typedef 
Kokkos::Experimental::Subview< DLT , true , false , false , false , false > DLS1 ; + + static_assert( DLS1::rank == 1 && std::is_same< typename DLS1::array_layout , Kokkos::LayoutLeft >::value + , "Subview layout error for rank 1 subview of left-most range of LayoutLeft" ); + + typedef Kokkos::Experimental::View<int***[13][14],Kokkos::LayoutRight,ExecSpace> DRT ; + typedef Kokkos::Experimental::Subview< DRT , false , false , false , false , true > DRS1 ; + + static_assert( DRS1::rank == 1 && std::is_same< typename DRS1::array_layout , Kokkos::LayoutRight >::value + , "Subview layout error for rank 1 subview of right-most range of LayoutRight" ); + + AT Aa ; + AS Ab ; + BT Ba ; + BS Bb ; + CT Ca ; + CS Cb ; + DT Da ; + DS Db ; + + TestViewMappingSubview() + : Aa("Aa",AN) + , Ab( Kokkos::Experimental::subview( Aa , std::pair<int,int>(1,AN-1) ) ) + , Ba("Ba",BN0,BN1,BN2) + , Bb( Kokkos::Experimental::subview( Ba + , std::pair<int,int>(1,BN0-1) + , std::pair<int,int>(1,BN1-1) + , std::pair<int,int>(1,BN2-1) + ) ) + , Ca("Ca",CN0,CN1,CN2) + , Cb( Kokkos::Experimental::subview( Ca + , std::pair<int,int>(1,CN0-1) + , std::pair<int,int>(1,CN1-1) + , std::pair<int,int>(1,CN2-1) + , 1 + , 2 + ) ) + , Da("Da",DN0,DN1,DN2) + , Db( Kokkos::Experimental::subview( Da + , 1 + , std::pair<int,int>(1,DN0-1) + , std::pair<int,int>(1,DN1-1) + , std::pair<int,int>(1,DN2-1) + , 2 + ) ) + { + } + + + KOKKOS_INLINE_FUNCTION + void operator()( const int , long & error_count ) const + { + for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ab[i-1] ) ++error_count ; + + for ( int i2 = 1 ; i2 < BN2-1 ; ++i2 ) { + for ( int i1 = 1 ; i1 < BN1-1 ; ++i1 ) { + for ( int i0 = 1 ; i0 < BN0-1 ; ++i0 ) { + if ( & Ba(i0,i1,i2) != & Bb(i0-1,i1-1,i2-1) ) ++error_count ; + }}} + + for ( int i2 = 1 ; i2 < CN2-1 ; ++i2 ) { + for ( int i1 = 1 ; i1 < CN1-1 ; ++i1 ) { + for ( int i0 = 1 ; i0 < CN0-1 ; ++i0 ) { + if ( & Ca(i0,i1,i2,1,2) != & Cb(i0-1,i1-1,i2-1) ) ++error_count ; + }}} + + for ( int i2 = 1 ; i2 < 
DN2-1 ; ++i2 ) { + for ( int i1 = 1 ; i1 < DN1-1 ; ++i1 ) { + for ( int i0 = 1 ; i0 < DN0-1 ; ++i0 ) { + if ( & Da(1,i0,i1,i2,2) != & Db(i0-1,i1-1,i2-1) ) ++error_count ; + }}} + } + + static void run() + { + TestViewMappingSubview self ; + + ASSERT_EQ( self.Da.stride_1() , self.Db.stride_0() ); + ASSERT_EQ( self.Da.stride_2() , self.Db.stride_1() ); + ASSERT_EQ( self.Da.stride_3() , self.Db.stride_2() ); + + long error_count = -1 ; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >(0,1) , self , error_count ); + ASSERT_EQ( error_count , 0 ); + } + +}; + +template< class ExecSpace > +void test_view_mapping_subview() +{ + TestViewMappingSubview< ExecSpace >::run(); +} + +/*--------------------------------------------------------------------------*/ + +template< class ViewType > +struct TestViewMapOperator { + + static_assert( ViewType::reference_type_is_lvalue_reference + , "Test only valid for lvalue reference type" ); + + const ViewType v ; + + KOKKOS_INLINE_FUNCTION + void test_left( size_t i0 , long & error_count ) const + { + typename ViewType::value_type * const base_ptr = & v(0,0,0,0,0,0,0,0); + const size_t n1 = v.dimension_1(); + const size_t n2 = v.dimension_2(); + const size_t n3 = v.dimension_3(); + const size_t n4 = v.dimension_4(); + const size_t n5 = v.dimension_5(); + const size_t n6 = v.dimension_6(); + const size_t n7 = v.dimension_7(); + + long offset = 0 ; + + for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) + for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) + for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) + for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) + for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) + { + const long d = & v(i0,i1,i2,i3,i4,i5,i6,i7) - base_ptr ; + if ( d < offset ) ++error_count ; + offset = d ; + } + + if ( v.span() <= size_t(offset) ) ++error_count ; + } + + KOKKOS_INLINE_FUNCTION + void test_right( size_t i0 , long & error_count ) const + { + typename ViewType::value_type * 
const base_ptr = & v(0,0,0,0,0,0,0,0); + const size_t n1 = v.dimension_1(); + const size_t n2 = v.dimension_2(); + const size_t n3 = v.dimension_3(); + const size_t n4 = v.dimension_4(); + const size_t n5 = v.dimension_5(); + const size_t n6 = v.dimension_6(); + const size_t n7 = v.dimension_7(); + + long offset = 0 ; + + for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) + for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) + for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) + for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) + for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) + for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) + for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) + { + const long d = & v(i0,i1,i2,i3,i4,i5,i6,i7) - base_ptr ; + if ( d < offset ) ++error_count ; + offset = d ; + } + + if ( v.span() <= size_t(offset) ) ++error_count ; + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_t i , long & error_count ) const + { + if ( std::is_same< typename ViewType::array_layout , Kokkos::LayoutLeft >::value ) + test_left(i,error_count); + else if ( std::is_same< typename ViewType::array_layout , Kokkos::LayoutRight >::value ) + test_right(i,error_count); + } + + constexpr static size_t N0 = 10 ; + constexpr static size_t N1 = 9 ; + constexpr static size_t N2 = 8 ; + constexpr static size_t N3 = 7 ; + constexpr static size_t N4 = 6 ; + constexpr static size_t N5 = 5 ; + constexpr static size_t N6 = 4 ; + constexpr static size_t N7 = 3 ; + + TestViewMapOperator() : v( "Test" , N0, N1, N2, N3, N4, N5, N6, N7 ) {} + + static void run() + { + TestViewMapOperator self ; + + ASSERT_EQ( self.v.dimension_0() , ( 0 < ViewType::rank ? N0 : 1 ) ); + ASSERT_EQ( self.v.dimension_1() , ( 1 < ViewType::rank ? N1 : 1 ) ); + ASSERT_EQ( self.v.dimension_2() , ( 2 < ViewType::rank ? N2 : 1 ) ); + ASSERT_EQ( self.v.dimension_3() , ( 3 < ViewType::rank ? N3 : 1 ) ); + ASSERT_EQ( self.v.dimension_4() , ( 4 < ViewType::rank ? N4 : 1 ) ); + ASSERT_EQ( self.v.dimension_5() , ( 5 < ViewType::rank ? 
N5 : 1 ) ); + ASSERT_EQ( self.v.dimension_6() , ( 6 < ViewType::rank ? N6 : 1 ) ); + ASSERT_EQ( self.v.dimension_7() , ( 7 < ViewType::rank ? N7 : 1 ) ); + + ASSERT_LE( self.v.dimension_0()* + self.v.dimension_1()* + self.v.dimension_2()* + self.v.dimension_3()* + self.v.dimension_4()* + self.v.dimension_5()* + self.v.dimension_6()* + self.v.dimension_7() + , self.v.span() ); + + long error_count ; + Kokkos::RangePolicy< typename ViewType::execution_space > range(0,self.v.dimension_0()); + Kokkos::parallel_reduce( range , self , error_count ); + ASSERT_EQ( 0 , error_count ); + } +}; + + +template< class ExecSpace > +void test_view_mapping_operator() +{ + TestViewMapOperator< Kokkos::Experimental::View<int,Kokkos::LayoutLeft,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int*,Kokkos::LayoutLeft,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int**,Kokkos::LayoutLeft,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int***,Kokkos::LayoutLeft,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int****,Kokkos::LayoutLeft,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int*****,Kokkos::LayoutLeft,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int******,Kokkos::LayoutLeft,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int*******,Kokkos::LayoutLeft,ExecSpace> >::run(); + + TestViewMapOperator< Kokkos::Experimental::View<int,Kokkos::LayoutRight,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int*,Kokkos::LayoutRight,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int**,Kokkos::LayoutRight,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int***,Kokkos::LayoutRight,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int****,Kokkos::LayoutRight,ExecSpace> >::run(); + TestViewMapOperator< 
Kokkos::Experimental::View<int*****,Kokkos::LayoutRight,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int******,Kokkos::LayoutRight,ExecSpace> >::run(); + TestViewMapOperator< Kokkos::Experimental::View<int*******,Kokkos::LayoutRight,ExecSpace> >::run(); +} + +/*--------------------------------------------------------------------------*/ + +template< class ExecSpace > +struct TestViewMappingAtomic { + typedef Kokkos::MemoryTraits< Kokkos::Atomic > mem_trait ; + + typedef Kokkos::Experimental::View< int * , ExecSpace > T ; + typedef Kokkos::Experimental::View< int * , ExecSpace , mem_trait > T_atom ; + + T x ; + T_atom x_atom ; + + constexpr static size_t N = 100000 ; + + struct TagInit {}; + struct TagUpdate {}; + struct TagVerify {}; + + KOKKOS_INLINE_FUNCTION + void operator()( const TagInit & , const int i ) const + { x(i) = i ; } + + KOKKOS_INLINE_FUNCTION + void operator()( const TagUpdate & , const int i ) const + { x_atom(i%2) += 1 ; } + + KOKKOS_INLINE_FUNCTION + void operator()( const TagVerify & , const int i , long & error_count ) const + { + if ( i < 2 ) { if ( x(i) != int(i + N / 2) ) ++error_count ; } + else { if ( x(i) != int(i) ) ++error_count ; } + } + + TestViewMappingAtomic() + : x("x",N) + , x_atom( x ) + {} + + static void run() + { + ASSERT_TRUE( T::reference_type_is_lvalue_reference ); + ASSERT_FALSE( T_atom::reference_type_is_lvalue_reference ); + + TestViewMappingAtomic self ; + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace , TagInit >(0,N) , self ); + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace , TagUpdate >(0,N) , self ); + long error_count = -1 ; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagVerify >(0,N) , self , error_count ); + ASSERT_EQ( 0 , error_count ); + } +}; + + +} /* namespace Test */ + +/*--------------------------------------------------------------------------*/ + diff --git a/lib/kokkos/core/unit_test/TestViewOfClass.hpp 
b/lib/kokkos/core/unit_test/TestViewOfClass.hpp new file mode 100755 index 0000000000000000000000000000000000000000..09abacd80de10950f94866a1b0ad368bc9527ce7 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewOfClass.hpp @@ -0,0 +1,126 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +namespace { +volatile int nested_view_count ; +} + +template< class Space > +class NestedView { +private: + Kokkos::View<int*,Space> member ; + +public: + + KOKKOS_INLINE_FUNCTION + NestedView() +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + : member("member",2) + { Kokkos::atomic_increment( & nested_view_count ); } +#else + : member(){} +#endif + + ~NestedView() +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { Kokkos::atomic_decrement( & nested_view_count ); } +#else + {} +#endif + +}; + + +template< class Space > +void view_nested_view() +{ + ASSERT_EQ( 0 , nested_view_count ); + { + Kokkos::View< NestedView<Space> * , Space > a("a_nested_view",2); + ASSERT_EQ( 2 , nested_view_count ); + Kokkos::View< NestedView<Space> * , Space > b("b_nested_view",2); + ASSERT_EQ( 4 , nested_view_count ); + } + // ASSERT_EQ( 0 , nested_view_count ); +} + +} + +namespace Kokkos { +namespace Impl { + +template< class ExecSpace , class S > +struct ViewDefaultConstruct< ExecSpace , Test::NestedView<S> , true > +{ + typedef Test::NestedView<S> 
type ; + type * const m_ptr ; + + KOKKOS_FORCEINLINE_FUNCTION + void operator()( const typename ExecSpace::size_type& i ) const + { new(m_ptr+i) type(); } + + ViewDefaultConstruct( type * pointer , size_t capacity ) + : m_ptr( pointer ) + { + Kokkos::RangePolicy< ExecSpace > range( 0 , capacity ); + parallel_for( range , *this ); + ExecSpace::fence(); + } +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp b/lib/kokkos/core/unit_test/TestViewSubview.hpp new file mode 100755 index 0000000000000000000000000000000000000000..8bf201fb47c41f0d3d2da2007057c5ef2aa54f23 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp @@ -0,0 +1,632 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> + +/*--------------------------------------------------------------------------*/ + +namespace TestViewSubview { + +#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) + +using Kokkos::Experimental::ALL ; + +#else + +namespace { + +const Kokkos::ALL ALL ; + +} + +#endif + +template<class Layout, class Space> +struct getView { + static + Kokkos::View<double**,Layout,Space> get(int n, int m) { + return Kokkos::View<double**,Layout,Space>("G",n,m); + } +}; + +template<class Space> +struct getView<Kokkos::LayoutStride,Space> { + static + Kokkos::View<double**,Kokkos::LayoutStride,Space> get(int n, int m) { + const int rank = 2 ; + const int order[] = { 0, 1 }; + const unsigned dim[] = { unsigned(n), unsigned(m) }; + Kokkos::LayoutStride stride = Kokkos::LayoutStride::order_dimensions( rank , order , dim ); + return Kokkos::View<double**,Kokkos::LayoutStride,Space>("G",stride); + } +}; + +template<class ViewType, class Space> 
+struct fill_1D { + typedef typename Space::execution_space execution_space; + typedef typename ViewType::size_type size_type; + ViewType a; + double val; + fill_1D(ViewType a_, double val_):a(a_),val(val_) { + } + KOKKOS_INLINE_FUNCTION + void operator() (const int i) const { + a(i) = val; + } +}; + +template<class ViewType, class Space> +struct fill_2D { + typedef typename Space::execution_space execution_space; + typedef typename ViewType::size_type size_type; + ViewType a; + double val; + fill_2D(ViewType a_, double val_):a(a_),val(val_) { + } + KOKKOS_INLINE_FUNCTION + void operator() (const int i) const{ + for(int j = 0; j < static_cast<int>(a.dimension_1()); j++) + a(i,j) = val; + } +}; + +template<class Layout, class Space> +void test_auto_1d () +{ + typedef Kokkos::View<double**, Layout, Space> mv_type; + typedef typename mv_type::size_type size_type; + const double ZERO = 0.0; + const double ONE = 1.0; + const double TWO = 2.0; + + const size_type numRows = 10; + const size_type numCols = 3; + + mv_type X = getView<Layout,Space>::get(numRows, numCols); + typename mv_type::HostMirror X_h = Kokkos::create_mirror_view (X); + + fill_2D<mv_type,Space> f1(X, ONE); + Kokkos::parallel_for(X.dimension_0(),f1); + Kokkos::deep_copy (X_h, X); + for (size_type j = 0; j < numCols; ++j) { + for (size_type i = 0; i < numRows; ++i) { + ASSERT_TRUE(X_h(i,j) == ONE); + } + } + + fill_2D<mv_type,Space> f2(X, 0.0); + Kokkos::parallel_for(X.dimension_0(),f2); + Kokkos::deep_copy (X_h, X); + for (size_type j = 0; j < numCols; ++j) { + for (size_type i = 0; i < numRows; ++i) { + ASSERT_TRUE(X_h(i,j) == ZERO); + } + } + + fill_2D<mv_type,Space> f3(X, TWO); + Kokkos::parallel_for(X.dimension_0(),f3); + Kokkos::deep_copy (X_h, X); + for (size_type j = 0; j < numCols; ++j) { + for (size_type i = 0; i < numRows; ++i) { + ASSERT_TRUE(X_h(i,j) == TWO); + } + } + + for (size_type j = 0; j < numCols; ++j) { + auto X_j = Kokkos::subview (X, TestViewSubview::ALL, j); + + 
fill_1D<decltype(X_j),Space> f4(X_j, ZERO); + Kokkos::parallel_for(X_j.dimension_0(),f4); + Kokkos::deep_copy (X_h, X); + for (size_type i = 0; i < numRows; ++i) { + ASSERT_TRUE(X_h(i,j) == ZERO); + } + + for (size_type jj = 0; jj < numCols; ++jj) { + auto X_jj = Kokkos::subview (X, TestViewSubview::ALL, jj); + fill_1D<decltype(X_jj),Space> f5(X_jj, ONE); + Kokkos::parallel_for(X_jj.dimension_0(),f5); + Kokkos::deep_copy (X_h, X); + for (size_type i = 0; i < numRows; ++i) { + ASSERT_TRUE(X_h(i,jj) == ONE); + } + } + } +} + +template<class LD, class LS, class Space> +void test_1d_strided_assignment_impl(bool a, bool b, bool c, bool d, int n, int m) { + Kokkos::View<double**,LS,Space> l2d("l2d",n,m); + + int col = n>2?2:0; + int row = m>2?2:0; + + if(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,Space>::value) { + if(a) { + Kokkos::View<double*,LD,Space> l1da = Kokkos::subview(l2d,TestViewSubview::ALL,row); + ASSERT_TRUE( & l1da(0) == & l2d(0,row) ); + if(n>1) + ASSERT_TRUE( & l1da(1) == & l2d(1,row) ); + } + if(b && n>13) { + Kokkos::View<double*,LD,Space> l1db = Kokkos::subview(l2d,std::pair<unsigned,unsigned>(2,13),row); + ASSERT_TRUE( & l1db(0) == & l2d(2,row) ); + ASSERT_TRUE( & l1db(1) == & l2d(3,row) ); + } + if(c) { + Kokkos::View<double*,LD,Space> l1dc = Kokkos::subview(l2d,col,TestViewSubview::ALL); + ASSERT_TRUE( & l1dc(0) == & l2d(col,0) ); + if(m>1) + ASSERT_TRUE( & l1dc(1) == & l2d(col,1) ); + } + if(d && m>13) { + Kokkos::View<double*,LD,Space> l1dd = Kokkos::subview(l2d,col,std::pair<unsigned,unsigned>(2,13)); + ASSERT_TRUE( & l1dd(0) == & l2d(col,2) ); + ASSERT_TRUE( & l1dd(1) == & l2d(col,3) ); + } + } + +} + +template<class Space > +void test_1d_strided_assignment() { + test_1d_strided_assignment_impl<Kokkos::LayoutStride,Kokkos::LayoutLeft,Space>(true,true,true,true,17,3); + test_1d_strided_assignment_impl<Kokkos::LayoutStride,Kokkos::LayoutRight,Space>(true,true,true,true,17,3); + + 
test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,false,false,17,3); + test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,false,false,17,3); + test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(false,false,true,true,17,3); + test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(false,false,true,true,17,3); + + test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,false,false,17,1); + test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,true,true,1,17); + test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,true,true,1,17); + test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,false,false,17,1); + + test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(true,true,true,true,17,1); + test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(false,false,true,true,1,17); + test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(false,false,true,true,1,17); + test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(true,true,true,true,17,1); +} + +template< class Space > +void test_left_0() +{ + typedef Kokkos::View< int [2][3][4][5][2][3][4][5] , Kokkos::LayoutLeft , Space > + view_static_8_type ; + + view_static_8_type x_static_8("x_static_left_8"); + + ASSERT_TRUE( x_static_8.is_contiguous() ); + + Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x_static_8 , 0, 0, 0, 0, 0, 0, 0, 0 ); + + ASSERT_TRUE( x0.is_contiguous() ); + ASSERT_TRUE( & x0() == & x_static_8(0,0,0,0,0,0,0,0) ); + + Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 = + Kokkos::subview( x_static_8, Kokkos::pair<int,int>(0,2), 1, 2, 3, 0, 1, 2, 3 ); + + ASSERT_TRUE( x1.is_contiguous() ); + ASSERT_TRUE( & x1(0) == & x_static_8(0,1,2,3,0,1,2,3) 
); + ASSERT_TRUE( & x1(1) == & x_static_8(1,1,2,3,0,1,2,3) ); + + Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 = + Kokkos::subview( x_static_8, Kokkos::pair<int,int>(0,2), 1, 2, 3 + , Kokkos::pair<int,int>(0,2), 1, 2, 3 ); + + ASSERT_TRUE( ! x2.is_contiguous() ); + ASSERT_TRUE( & x2(0,0) == & x_static_8(0,1,2,3,0,1,2,3) ); + ASSERT_TRUE( & x2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) ); + ASSERT_TRUE( & x2(0,1) == & x_static_8(0,1,2,3,1,1,2,3) ); + ASSERT_TRUE( & x2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) ); + + // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 = + Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 = + Kokkos::subview( x_static_8, 1, Kokkos::pair<int,int>(0,2), 2, 3 + , Kokkos::pair<int,int>(0,2), 1, 2, 3 ); + + ASSERT_TRUE( ! sx2.is_contiguous() ); + ASSERT_TRUE( & sx2(0,0) == & x_static_8(1,0,2,3,0,1,2,3) ); + ASSERT_TRUE( & sx2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) ); + ASSERT_TRUE( & sx2(0,1) == & x_static_8(1,0,2,3,1,1,2,3) ); + ASSERT_TRUE( & sx2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) ); + + Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 = + Kokkos::subview( x_static_8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */ + , 1, Kokkos::pair<int,int>(1,3) /* of [5] */ + , 1, Kokkos::pair<int,int>(0,2) /* of [3] */ + , 2, Kokkos::pair<int,int>(2,4) /* of [5] */ + ); + + ASSERT_TRUE( ! 
sx4.is_contiguous() ); + + for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 ) + for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 ) + for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 ) + for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) { + ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x_static_8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) ); + } +} + +template< class Space > +void test_left_1() +{ + typedef Kokkos::View< int ****[2][3][4][5] , Kokkos::LayoutLeft , Space > + view_type ; + + view_type x8("x_left_8",2,3,4,5); + + ASSERT_TRUE( x8.is_contiguous() ); + + Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x8 , 0, 0, 0, 0, 0, 0, 0, 0 ); + + ASSERT_TRUE( x0.is_contiguous() ); + ASSERT_TRUE( & x0() == & x8(0,0,0,0,0,0,0,0) ); + + Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 = + Kokkos::subview( x8, Kokkos::pair<int,int>(0,2), 1, 2, 3, 0, 1, 2, 3 ); + + ASSERT_TRUE( x1.is_contiguous() ); + ASSERT_TRUE( & x1(0) == & x8(0,1,2,3,0,1,2,3) ); + ASSERT_TRUE( & x1(1) == & x8(1,1,2,3,0,1,2,3) ); + + Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 = + Kokkos::subview( x8, Kokkos::pair<int,int>(0,2), 1, 2, 3 + , Kokkos::pair<int,int>(0,2), 1, 2, 3 ); + + ASSERT_TRUE( ! x2.is_contiguous() ); + ASSERT_TRUE( & x2(0,0) == & x8(0,1,2,3,0,1,2,3) ); + ASSERT_TRUE( & x2(1,0) == & x8(1,1,2,3,0,1,2,3) ); + ASSERT_TRUE( & x2(0,1) == & x8(0,1,2,3,1,1,2,3) ); + ASSERT_TRUE( & x2(1,1) == & x8(1,1,2,3,1,1,2,3) ); + + // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 = + Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 = + Kokkos::subview( x8, 1, Kokkos::pair<int,int>(0,2), 2, 3 + , Kokkos::pair<int,int>(0,2), 1, 2, 3 ); + + ASSERT_TRUE( ! 
sx2.is_contiguous() ); + ASSERT_TRUE( & sx2(0,0) == & x8(1,0,2,3,0,1,2,3) ); + ASSERT_TRUE( & sx2(1,0) == & x8(1,1,2,3,0,1,2,3) ); + ASSERT_TRUE( & sx2(0,1) == & x8(1,0,2,3,1,1,2,3) ); + ASSERT_TRUE( & sx2(1,1) == & x8(1,1,2,3,1,1,2,3) ); + + Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 = + Kokkos::subview( x8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */ + , 1, Kokkos::pair<int,int>(1,3) /* of [5] */ + , 1, Kokkos::pair<int,int>(0,2) /* of [3] */ + , 2, Kokkos::pair<int,int>(2,4) /* of [5] */ + ); + + ASSERT_TRUE( ! sx4.is_contiguous() ); + + for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 ) + for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 ) + for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 ) + for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) { + ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) ); + } +} + +template< class Space > +void test_left_2() +{ + typedef Kokkos::View< int **** , Kokkos::LayoutLeft , Space > view_type ; + + view_type x4("x4",2,3,4,5); + + ASSERT_TRUE( x4.is_contiguous() ); + + Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x4 , 0, 0, 0, 0 ); + + ASSERT_TRUE( x0.is_contiguous() ); + ASSERT_TRUE( & x0() == & x4(0,0,0,0) ); + + Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 = + Kokkos::subview( x4, Kokkos::pair<int,int>(0,2), 1, 2, 3 ); + + ASSERT_TRUE( x1.is_contiguous() ); + ASSERT_TRUE( & x1(0) == & x4(0,1,2,3) ); + ASSERT_TRUE( & x1(1) == & x4(1,1,2,3) ); + + Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 = + Kokkos::subview( x4, Kokkos::pair<int,int>(0,2), 1, Kokkos::pair<int,int>(1,3), 2 ); + + ASSERT_TRUE( ! 
x2.is_contiguous() ); + ASSERT_TRUE( & x2(0,0) == & x4(0,1,1,2) ); + ASSERT_TRUE( & x2(1,0) == & x4(1,1,1,2) ); + ASSERT_TRUE( & x2(0,1) == & x4(0,1,2,2) ); + ASSERT_TRUE( & x2(1,1) == & x4(1,1,2,2) ); + + // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 = + Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 = + Kokkos::subview( x4, 1, Kokkos::pair<int,int>(0,2) + , 2, Kokkos::pair<int,int>(1,4) ); + + ASSERT_TRUE( ! sx2.is_contiguous() ); + ASSERT_TRUE( & sx2(0,0) == & x4(1,0,2,1) ); + ASSERT_TRUE( & sx2(1,0) == & x4(1,1,2,1) ); + ASSERT_TRUE( & sx2(0,1) == & x4(1,0,2,2) ); + ASSERT_TRUE( & sx2(1,1) == & x4(1,1,2,2) ); + ASSERT_TRUE( & sx2(0,2) == & x4(1,0,2,3) ); + ASSERT_TRUE( & sx2(1,2) == & x4(1,1,2,3) ); + + Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 = + Kokkos::subview( x4, Kokkos::pair<int,int>(1,2) /* of [2] */ + , Kokkos::pair<int,int>(1,3) /* of [3] */ + , Kokkos::pair<int,int>(0,4) /* of [4] */ + , Kokkos::pair<int,int>(2,4) /* of [5] */ + ); + + ASSERT_TRUE( ! 
sx4.is_contiguous() ); + + for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 ) + for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 ) + for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 ) + for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) { + ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x4( 1+i0, 1+i1, 0+i2, 2+i3 ) ); + } +} + +template< class Space > +void test_left_3() +{ + typedef Kokkos::View< int ** , Kokkos::LayoutLeft , Space > view_type ; + + view_type xm("x4",10,5); + + ASSERT_TRUE( xm.is_contiguous() ); + + Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( xm , 5, 3 ); + + ASSERT_TRUE( x0.is_contiguous() ); + ASSERT_TRUE( & x0() == & xm(5,3) ); + + Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 = + Kokkos::subview( xm, TestViewSubview::ALL, 3 ); + + ASSERT_TRUE( x1.is_contiguous() ); + for ( int i = 0 ; i < int(xm.dimension_0()) ; ++i ) { + ASSERT_TRUE( & x1(i) == & xm(i,3) ); + } + + Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 = + Kokkos::subview( xm, Kokkos::pair<int,int>(1,9), TestViewSubview::ALL ); + + ASSERT_TRUE( ! 
x2.is_contiguous() ); + for ( int j = 0 ; j < int(x2.dimension_1()) ; ++j ) + for ( int i = 0 ; i < int(x2.dimension_0()) ; ++i ) { + ASSERT_TRUE( & x2(i,j) == & xm(1+i,j) ); + } + + Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2c = + Kokkos::subview( xm, TestViewSubview::ALL, std::pair<int,int>(2,4) ); + + ASSERT_TRUE( x2c.is_contiguous() ); + for ( int j = 0 ; j < int(x2c.dimension_1()) ; ++j ) + for ( int i = 0 ; i < int(x2c.dimension_0()) ; ++i ) { + ASSERT_TRUE( & x2c(i,j) == & xm(i,2+j) ); + } + + Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2_n1 = + Kokkos::subview( xm , std::pair<int,int>(1,1) , TestViewSubview::ALL ); + + ASSERT_TRUE( x2_n1.dimension_0() == 0 ); + ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() ); + + Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2_n2 = + Kokkos::subview( xm , TestViewSubview::ALL , std::pair<int,int>(1,1) ); + + ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() ); + ASSERT_TRUE( x2_n2.dimension_1() == 0 ); +} + +//---------------------------------------------------------------------------- + +template< class Space > +void test_right_0() +{ + typedef Kokkos::View< int [2][3][4][5][2][3][4][5] , Kokkos::LayoutRight , Space > + view_static_8_type ; + + view_static_8_type x_static_8("x_static_right_8"); + + Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( x_static_8 , 0, 0, 0, 0, 0, 0, 0, 0 ); + + ASSERT_TRUE( & x0() == & x_static_8(0,0,0,0,0,0,0,0) ); + + Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 = + Kokkos::subview( x_static_8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair<int,int>(1,3) ); + + ASSERT_TRUE( & x1(0) == & x_static_8(0,1,2,3,0,1,2,1) ); + ASSERT_TRUE( & x1(1) == & x_static_8(0,1,2,3,0,1,2,2) ); + + Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 = + Kokkos::subview( x_static_8, 0, 1, 2, Kokkos::pair<int,int>(1,3) + , 0, 1, 2, Kokkos::pair<int,int>(1,3) ); + + ASSERT_TRUE( & x2(0,0) == & x_static_8(0,1,2,1,0,1,2,1) ); + ASSERT_TRUE( & x2(1,0) == & x_static_8(0,1,2,2,0,1,2,1) ); + 
ASSERT_TRUE( & x2(0,1) == & x_static_8(0,1,2,1,0,1,2,2) ); + ASSERT_TRUE( & x2(1,1) == & x_static_8(0,1,2,2,0,1,2,2) ); + + // Kokkos::View<int**,Kokkos::LayoutRight,Space> error_2 = + Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 = + Kokkos::subview( x_static_8, 1, Kokkos::pair<int,int>(0,2), 2, 3 + , Kokkos::pair<int,int>(0,2), 1, 2, 3 ); + + ASSERT_TRUE( & sx2(0,0) == & x_static_8(1,0,2,3,0,1,2,3) ); + ASSERT_TRUE( & sx2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) ); + ASSERT_TRUE( & sx2(0,1) == & x_static_8(1,0,2,3,1,1,2,3) ); + ASSERT_TRUE( & sx2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) ); + + Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 = + Kokkos::subview( x_static_8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */ + , 1, Kokkos::pair<int,int>(1,3) /* of [5] */ + , 1, Kokkos::pair<int,int>(0,2) /* of [3] */ + , 2, Kokkos::pair<int,int>(2,4) /* of [5] */ + ); + + for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 ) + for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 ) + for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 ) + for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) { + ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x_static_8(0, 0+i0, 1, 1+i1, 1, 0+i2, 2, 2+i3) ); + } +} + +template< class Space > +void test_right_1() +{ + typedef Kokkos::View< int ****[2][3][4][5] , Kokkos::LayoutRight , Space > + view_type ; + + view_type x8("x_right_8",2,3,4,5); + + Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( x8 , 0, 0, 0, 0, 0, 0, 0, 0 ); + + ASSERT_TRUE( & x0() == & x8(0,0,0,0,0,0,0,0) ); + + Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 = + Kokkos::subview( x8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair<int,int>(1,3) ); + + ASSERT_TRUE( & x1(0) == & x8(0,1,2,3,0,1,2,1) ); + ASSERT_TRUE( & x1(1) == & x8(0,1,2,3,0,1,2,2) ); + + Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 = + Kokkos::subview( x8, 0, 1, 2, Kokkos::pair<int,int>(1,3) + , 0, 1, 2, Kokkos::pair<int,int>(1,3) ); + + ASSERT_TRUE( & x2(0,0) == & x8(0,1,2,1,0,1,2,1) ); + 
ASSERT_TRUE( & x2(1,0) == & x8(0,1,2,2,0,1,2,1) ); + ASSERT_TRUE( & x2(0,1) == & x8(0,1,2,1,0,1,2,2) ); + ASSERT_TRUE( & x2(1,1) == & x8(0,1,2,2,0,1,2,2) ); + + // Kokkos::View<int**,Kokkos::LayoutRight,Space> error_2 = + Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 = + Kokkos::subview( x8, 1, Kokkos::pair<int,int>(0,2), 2, 3 + , Kokkos::pair<int,int>(0,2), 1, 2, 3 ); + + ASSERT_TRUE( & sx2(0,0) == & x8(1,0,2,3,0,1,2,3) ); + ASSERT_TRUE( & sx2(1,0) == & x8(1,1,2,3,0,1,2,3) ); + ASSERT_TRUE( & sx2(0,1) == & x8(1,0,2,3,1,1,2,3) ); + ASSERT_TRUE( & sx2(1,1) == & x8(1,1,2,3,1,1,2,3) ); + + Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 = + Kokkos::subview( x8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */ + , 1, Kokkos::pair<int,int>(1,3) /* of [5] */ + , 1, Kokkos::pair<int,int>(0,2) /* of [3] */ + , 2, Kokkos::pair<int,int>(2,4) /* of [5] */ + ); + + for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 ) + for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 ) + for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 ) + for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) { + ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) ); + } +} + +template< class Space > +void test_right_3() +{ + typedef Kokkos::View< int ** , Kokkos::LayoutRight , Space > view_type ; + + view_type xm("x4",10,5); + + ASSERT_TRUE( xm.is_contiguous() ); + + Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( xm , 5, 3 ); + + ASSERT_TRUE( x0.is_contiguous() ); + ASSERT_TRUE( & x0() == & xm(5,3) ); + + Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 = + Kokkos::subview( xm, 3, TestViewSubview::ALL ); + + ASSERT_TRUE( x1.is_contiguous() ); + for ( int i = 0 ; i < int(xm.dimension_1()) ; ++i ) { + ASSERT_TRUE( & x1(i) == & xm(3,i) ); + } + + Kokkos::View<int**,Kokkos::LayoutRight,Space> x2c = + Kokkos::subview( xm, Kokkos::pair<int,int>(1,9), TestViewSubview::ALL ); + + ASSERT_TRUE( x2c.is_contiguous() ); + for ( int j = 0 ; j < 
int(x2c.dimension_1()) ; ++j ) + for ( int i = 0 ; i < int(x2c.dimension_0()) ; ++i ) { + ASSERT_TRUE( & x2c(i,j) == & xm(1+i,j) ); + } + + Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 = + Kokkos::subview( xm, TestViewSubview::ALL, std::pair<int,int>(2,4) ); + + ASSERT_TRUE( ! x2.is_contiguous() ); + for ( int j = 0 ; j < int(x2.dimension_1()) ; ++j ) + for ( int i = 0 ; i < int(x2.dimension_0()) ; ++i ) { + ASSERT_TRUE( & x2(i,j) == & xm(i,2+j) ); + } + + Kokkos::View<int**,Kokkos::LayoutRight,Space> x2_n1 = + Kokkos::subview( xm , std::pair<int,int>(1,1) , TestViewSubview::ALL ); + + ASSERT_TRUE( x2_n1.dimension_0() == 0 ); + ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() ); + + Kokkos::View<int**,Kokkos::LayoutRight,Space> x2_n2 = + Kokkos::subview( xm , TestViewSubview::ALL , std::pair<int,int>(1,1) ); + + ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() ); + ASSERT_TRUE( x2_n2.dimension_1() == 0 ); +} + +//---------------------------------------------------------------------------- + +} + diff --git a/lib/kokkos/core/unit_test/UnitTestMain.cpp b/lib/kokkos/core/unit_test/UnitTestMain.cpp new file mode 100755 index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87 --- /dev/null +++ b/lib/kokkos/core/unit_test/UnitTestMain.cpp @@ -0,0 +1,50 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +int main(int argc, char *argv[]) { + ::testing::InitGoogleTest(&argc,argv); + return RUN_ALL_TESTS(); +} + diff --git a/lib/kokkos/doc/Doxyfile b/lib/kokkos/doc/Doxyfile new file mode 100755 index 0000000000000000000000000000000000000000..bc5c7486b27fc55ede35359b969af0a8008f960b --- /dev/null +++ b/lib/kokkos/doc/Doxyfile @@ -0,0 +1,127 @@ +# +# Include the global look and feel options +# +@INCLUDE = ../../common/Doxyfile +# +# Package options +# +PROJECT_NAME = "Kokkos Core Kernels Package" +PROJECT_NUMBER = "Version of the Day" +OUTPUT_DIRECTORY = . 
+OUTPUT_LANGUAGE = English + +EXTRACT_ALL = NO +EXTRACT_PRIVATE = NO +EXTRACT_STATIC = YES +HIDE_UNDOC_MEMBERS = YES +HIDE_UNDOC_CLASSES = YES +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ALWAYS_DETAILED_SEC = YES +FULL_PATH_NAMES = NO +STRIP_FROM_PATH = +INTERNAL_DOCS = NO +CLASS_DIAGRAMS = YES +SOURCE_BROWSER = YES +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = YES +REFERENCED_BY_RELATION = NO +REFERENCES_RELATION = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +VERBATIM_HEADERS = YES +SHOW_INCLUDE_FILES = YES +#JAVADOC_AUTOBRIEF = YES +INHERIT_DOCS = YES +INLINE_INHERITED_MEMB = YES +INLINE_INFO = YES +SORT_MEMBER_DOCS = NO +TAB_SIZE = 2 +ENABLED_SECTIONS = +SORT_BRIEF_DOCS = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +QUIET = NO +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_FORMAT = "$file:$line: $text" + +# +# INPUT: Where to find files that Doxygen should process. ../classic +# has a doc/ subdirectory with its own Doxyfile that points to its own +# files. The other Kokkos subpackages don't currently have their own +# Doxyfile files, so we have to do it manually here. +# +# mfh 26 Sep 2013: I've only added those directories in the Core +# subpackage that constitute the "public interface" of that +# subpackage. Please feel free to include additional subdirectories +# of ../core if you want to generate their documentation as well. +# +# mfh 26 Sep 2013: I've only added the Kokkos subpackages here that I +# think are ready for Doxygen documentation generation. Please feel +# free to amend this list as you see fit. 
+# + +INPUT = index.doc ../classic ../core/src ../containers/src ../linalg/src +FILE_PATTERNS = *.hpp *.cpp *.cuh *.cu +RECURSIVE = NO +EXCLUDE_PATTERNS = *.x *.o *.out +EXAMPLE_PATH = +EXAMPLE_RECURSIVE = YES +EXAMPLE_PATTERNS = *.cpp *.hpp +IMAGE_PATH = +INPUT_FILTER = +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 4 +IGNORE_PREFIX = +# +# What diagrams are created +# +CLASS_GRAPH = YES +COLLABORATION_GRAPH = NO +INCLUDE_GRAPH = NO +INCLUDED_BY_GRAPH = NO +GRAPHICAL_HIERARCHY = YES +# +# Preprocessing +# +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = YES +EXPAND_ONLY_PREDEF = YES +SEARCH_INCLUDES = YES +INCLUDE_FILE_PATTERNS = +PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS DOXYGEN_USE_ONLY +INCLUDE_PATH = ../src +EXPAND_AS_DEFINED = +# +# Links to other packages +# +TAGFILES = ../../common/tag_files/teuchos.tag=../../../teuchos/doc/html ../../common/tag_files/epetra.tag=../../../epetra/doc/html \ + ../../common/tag_files/belos.tag=../../../belos/doc/html ../../common/tag_files/anasazi.tag=../../../anasazi/doc/html \ + ../../common/tag_files/kokkos.tag=../../../kokkos/doc/html +GENERATE_TAGFILE = ../../common/tag_files/tpetra.tag +ALLEXTERNALS = NO +EXTERNAL_GROUPS = NO +# +# Environment +# +PERL_PATH = /usr/bin/perl +HAVE_DOT = YES +DOT_PATH = +MAX_DOT_GRAPH_WIDTH = 1024 +MAX_DOT_GRAPH_HEIGHT = 1024 +# +# What kind of documentation is generated +# +#GENERATE_HTML = YES +#HTML_OUTPUT = html +#HTML_HEADER = includes/header.html +#HTML_FOOTER = includes/footer.html +#HTML_STYLESHEET = includes/stylesheet.css +#HTML_ALIGN_MEMBERS = YES +GENERATE_HTMLHELP = NO +DISABLE_INDEX = NO +GENERATE_LATEX = NO +GENERATE_RTF = NO +GENERATE_MAN = NO +GENERATE_XML = NO diff --git a/lib/kokkos/doc/Kokkos_PG.pdf b/lib/kokkos/doc/Kokkos_PG.pdf new file mode 100755 index 0000000000000000000000000000000000000000..3c415698c0d9fec315f317b71db19f2a019b6f6e Binary files /dev/null and b/lib/kokkos/doc/Kokkos_PG.pdf differ diff --git a/lib/kokkos/doc/README b/lib/kokkos/doc/README new file mode 
100755 index 0000000000000000000000000000000000000000..31e75f365c21a116a1fb736097f4f524e8d1e021 --- /dev/null +++ b/lib/kokkos/doc/README @@ -0,0 +1,32 @@ +Kokkos uses the Doxygen tool for providing three documentation +sources: +- man pages +- Latex User Guide +- HTML Online User Guide. + +Man Pages + +Man pages are available for all files and functions in the directory +TRILINOS_HOME/doc/kokkos/man, where TRILINOS_HOME is the location of your +copy of Trilinos. To use these pages with the Unix man utility, add +the directory to your man path as follows: + +setenv MANPATH `echo $MANPATH`:TRILINOS_HOME/doc/kokkos/man + + +LaTeX User Guide + +A postscript version of this guide is in +TRILINOS_HOME/doc/kokkos/latex/user_guide.ps. The LaTeX source is in the +directory TRILINOS_HOME/doc/kokkos/latex. + +HTML Online User Guide + +The online guide is initiated by pointing your browser to +TRILINOS_HOME/doc/kokkos/html/index.html + +Any question, comments or suggestions are welcome. Please send to +Mike Heroux at + +320-845-7695 +maherou@sandia.gov diff --git a/lib/kokkos/doc/build_docs b/lib/kokkos/doc/build_docs new file mode 100755 index 0000000000000000000000000000000000000000..da1d3e4f6e061804b1fb2fe21b356b691494df5d --- /dev/null +++ b/lib/kokkos/doc/build_docs @@ -0,0 +1,15 @@ +#!/bin/sh + +if [ $TRILINOS_HOME ]; then + echo "TRILINOS_HOME has already been set!" +else + echo "TRILINOS_HOME has not been set. Setting it!" + export TRILINOS_HOME=`pwd`/../../.. +fi + +echo +echo "Generating main Kokkos doxygen documentation ..." +echo + +doxygen Doxyfile + diff --git a/lib/kokkos/doc/index.doc b/lib/kokkos/doc/index.doc new file mode 100755 index 0000000000000000000000000000000000000000..27a9e4f2e7b90e11bbcde7309e9bf1544e3b386f --- /dev/null +++ b/lib/kokkos/doc/index.doc @@ -0,0 +1,72 @@ +/*! +\mainpage Trilinos/Kokkos: Shared-memory programming interface and computational kernels + +\section Kokkos_Intro Introduction + +The %Kokkos package has two main components. 
The first, sometimes +called "%Kokkos Array" or just "%Kokkos," implements a +performance-portable shared-memory parallel programming model and data +containers. The second, called "%Kokkos Classic," consists of +computational kernels that support the %Tpetra package. + +\section Kokkos_Kokkos The %Kokkos programming model + +%Kokkos implements a performance-portable shared-memory parallel +programming model and data containers. It lets you write an algorithm +once, and just change a template parameter to get the optimal data +layout for your hardware. %Kokkos has back-ends for the following +parallel programming models: + +- Kokkos::Threads: POSIX Threads (Pthreads) +- Kokkos::OpenMP: OpenMP +- Kokkos::Cuda: NVIDIA's CUDA programming model for graphics + processing units (GPUs) +- Kokkos::Serial: No thread parallelism + +%Kokkos also has optimizations for shared-memory parallel systems with +nonuniform memory access (NUMA). Its containers can hold data of any +primitive ("plain old") data type (and some aggregate types). %Kokkos +Array may be used as a stand-alone programming model. + +%Kokkos' parallel operations include the following: + +- parallel_for: a thread-parallel "for loop" +- parallel_reduce: a thread-parallel reduction +- parallel_scan: a thread-parallel prefix scan operation + +as well as expert-level platform-independent interfaces to thread +"teams," per-team "shared memory," synchronization, and atomic update +operations. + +%Kokkos' data containers include the following: + +- Kokkos::View: A multidimensional array suitable for thread-parallel + operations. Its layout (e.g., row-major or column-major) is + optimized by default for the particular thread-parallel device. +- Kokkos::Vector: A drop-in replacement for std::vector that eases + porting from standard sequential C++ data structures to %Kokkos' + parallel data structures. +- Kokkos::UnorderedMap: A parallel lookup table comparable in + functionality to std::unordered_map. 
+ +%Kokkos also uses the above basic containers to implement higher-level +data structures, like sparse graphs and matrices. + +A good place to start learning about %Kokkos would be <a href="http://trilinos.sandia.gov/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf">these tutorial slides</a> from the 2013 Trilinos Users' Group meeting. + +\section Kokkos_Classic %Kokkos Classic + +"%Kokkos Classic" consists of computational kernels that support the +%Tpetra package. These kernels include sparse matrix-vector multiply, +sparse triangular solve, Gauss-Seidel, and dense vector operations. +They are templated on the type of objects (\c Scalar) on which they +operate. This component was not meant to be visible to users; it is +an implementation detail of the %Tpetra distributed linear algebra +package. + +%Kokkos Classic also implements a shared-memory parallel programming +model. This inspired and preceded the %Kokkos programming model +described in the previous section. Users should consider the %Kokkos +Classic programming model deprecated, and prefer the new %Kokkos +programming model. 
+*/ diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash new file mode 100755 index 0000000000000000000000000000000000000000..2e595dcc1c9f333ba84e38442a90c120625c949c --- /dev/null +++ b/lib/kokkos/generate_makefile.bash @@ -0,0 +1,204 @@ +#!/bin/bash + +KOKKOS_DEVICES="" + +while [[ $# > 0 ]] +do +key="$1" + +case $key in + --kokkos-path*) + KOKKOS_PATH="${key#*=}" + ;; + --prefix*) + PREFIX="${key#*=}" + ;; + --with-cuda) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda" + CUDA_PATH_NVCC=`which nvcc` + CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc} + ;; + --with-cuda*) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda" + CUDA_PATH="${key#*=}" + ;; + --with-openmp) + KOKKOS_DEVICES="${KOKKOS_DEVICES},OpenMP" + ;; + --with-pthread) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Pthread" + ;; + --with-serial) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Serial" + ;; + --with-devices*) + DEVICES="${key#*=}" + KOKKOS_DEVICES="${KOKKOS_DEVICES},${DEVICES}" + ;; + --with-gtest*) + GTEST_PATH="${key#*=}" + ;; + --with-hwloc*) + HWLOC_PATH="${key#*=}" + ;; + --arch*) + KOKKOS_ARCH="${key#*=}" + ;; + --cxxflags*) + CXXFLAGS="${key#*=}" + ;; + --ldflags*) + LDFLAGS="${key#*=}" + ;; + --debug|-dbg) + KOKKOS_DEBUG=yes + ;; + --compiler*) + COMPILER="${key#*=}" + ;; + --help) + echo "Kokkos configure options:" + echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" + echo "" + echo "--with-cuda[=/Path/To/Cuda]: enable Cuda and set path to Cuda Toolkit" + echo "--with-openmp: enable OpenMP backend" + echo "--with-pthread: enable Pthreads backend" + echo "--with-serial: enable Serial backend" + echo "--with-devices: explicitly add a set of backends" + echo "" + echo "--arch=[OPTIONS]: set target architectures. 
Options are:" + echo " SNB = Intel Sandy/Ivy Bridge CPUs" + echo " HSW = Intel Haswell CPUs" + echo " KNC = Intel Knights Corner Xeon Phi" + echo " Kepler30 = NVIDIA Kepler generation CC 3.0" + echo " Kepler35 = NVIDIA Kepler generation CC 3.5" + echo " Kepler37 = NVIDIA Kepler generation CC 3.7" + echo " Maxwell50 = NVIDIA Maxwell generation CC 5.0" + echo " Power8 = IBM Power 8 CPUs" + echo "" + echo "--compiler=/Path/To/Compiler set the compiler" + echo "--debug,-dbg: enable Debugging" + echo "--cxxflags=[FLAGS] overwrite CXXFLAGS for library build and test build" + echo " This will still set certain required flags via" + echo " KOKKOS_CXXFLAGS (such as -fopenmp, --std=c++11, etc.)" + echo "--ldflags=[FLAGS] overwrite LDFLAGS for library build and test build" + echo " This will still set certain required flags via" + echo " KOKKOS_LDFLAGS (such as -fopenmp, -lpthread, etc.)" + echo "--with-gtest=/Path/To/Gtest: set path to gtest (used in unit and performance tests" + echo "--with-hwloc=/Path/To/Hwloc: set path to hwloc" + exit 0 + ;; + *) + # unknown option + ;; +esac +shift +done + +# If KOKKOS_PATH undefined, assume parent dir of this +# script is the KOKKOS_PATH +if [ -z "$KOKKOS_PATH" ]; then + KOKKOS_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) +else + # Ensure KOKKOS_PATH is abs path + KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) +fi + +KOKKOS_OPTIONS="KOKKOS_PATH=${KOKKOS_PATH}" + +if [ ${#COMPILER} -gt 0 ]; then +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} CXX=${COMPILER}" +fi +if [ ${#PREFIX} -gt 0 ]; then +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} PREFIX=${PREFIX}" +fi +if [ ${#KOKKOS_DEVICES} -gt 0 ]; then +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} KOKKOS_DEVICES=${KOKKOS_DEVICES}" +fi +if [ ${#KOKKOS_ARCH} -gt 0 ]; then +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} KOKKOS_ARCH=${KOKKOS_ARCH}" +fi +if [ ${#KOKKOS_DEBUG} -gt 0 ]; then +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} KOKKOS_DEBUG=${KOKKOS_DEBUG}" +fi +if [ ${#CUDA_PATH} -gt 0 ]; then +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} 
CUDA_PATH=${CUDA_PATH}" +fi +if [ ${#CXXFLAGS} -gt 0 ]; then +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} CXXFLAGS=\"${CXXFLAGS}\"" +fi +if [ ${#LDFLAGS} -gt 0 ]; then +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} LDFLAGS=\"${LDFLAGS}\"" +fi +if [ ${#GTEST_PATH} -gt 0 ]; then +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} GTEST_PATH=${GTEST_PATH}" +else +GTEST_PATH=${KOKKOS_PATH}/tpls/gtest +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} GTEST_PATH=${GTEST_PATH}" +fi +if [ ${#HWLOC_PATH} -gt 0 ]; then +KOKKOS_OPTIONS="${KOKKOS_OPTIONS} HWLOC_PATH=${HWLOC_PATH} KOKKOS_USE_TPLS=hwloc" +fi +mkdir core +mkdir core/unit_test +mkdir core/perf_test +mkdir containers +mkdir containers/unit_tests +mkdir containers/performance_tests +mkdir algorithms +mkdir algorithms/unit_tests +mkdir algorithms/performance_tests +mkdir example +mkdir example/fixture +mkdir example/feint +mkdir example/fenl + + +echo "Generating Makefile with options " ${KOKKOS_OPTIONS} +echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > Makefile +echo "" >> Makefile +echo "lib:" >> Makefile +echo -e "\tcd core; \\" >> Makefile +echo -e "\tmake -j -j -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_OPTIONS}" >> Makefile +echo "" >> Makefile +echo "install: lib" >> Makefile +echo -e "\tcd core; \\" >> Makefile +echo -e "\tmake -j -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_OPTIONS} install" >> Makefile +echo "" >> Makefile +echo "build-test:" >> Makefile +echo -e "\tcd core/unit_test; \\" >> Makefile +echo -e "\tmake -j -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_OPTIONS}" >> Makefile +echo -e "\tcd core/perf_test; \\" >> Makefile +echo -e "\tmake -j -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_OPTIONS}" >> Makefile +echo -e "\tcd containers/unit_tests; \\" >> Makefile +echo -e "\tmake -j -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_OPTIONS}" >> Makefile +echo -e "\tcd containers/performance_tests; \\" >> Makefile +echo -e "\tmake -j -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_OPTIONS}" >> Makefile +echo -e "\tcd 
algorithms/unit_tests; \\" >> Makefile +echo -e "\tmake -j -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_OPTIONS}" >> Makefile +echo -e "\tcd example/fixture; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_OPTIONS}" >> Makefile +echo -e "\tcd example/feint; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_OPTIONS}" >> Makefile +echo -e "\tcd example/fenl; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_OPTIONS}" >> Makefile +echo "" >> Makefile +echo "test: build-test" >> Makefile +echo -e "\tcd core/unit_test; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_OPTIONS} test" >> Makefile +echo -e "\tcd core/perf_test; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_OPTIONS} test" >> Makefile +echo -e "\tcd containers/unit_tests; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_OPTIONS} test" >> Makefile +echo -e "\tcd containers/performance_tests; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_OPTIONS} test" >> Makefile +echo -e "\tcd algorithms/unit_tests; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_OPTIONS} test" >> Makefile +echo -e "\tcd example/fixture; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_OPTIONS} test" >> Makefile +echo -e "\tcd example/feint; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_OPTIONS} test" >> Makefile +echo -e "\tcd example/fenl; \\" >> Makefile +echo -e "\tmake -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_OPTIONS} test" >> Makefile + +